%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.12", %%% date = "10 January 2025", %%% time = "10:09:14 MST", %%% filename = "ieeecomputarchitlett.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% URL = "https://www.math.utah.edu/~beebe", %%% checksum = "07634 35489 180585 1838204", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "BibTeX; bibliography; IEEE Computer %%% Architecture Letters", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE bibliography of %%% publications in the journal IEEE Computer %%% Architecture Letters (CODEN none, ISSN %%% 1556-6056 (print), 1556-6064 (electronic)). %%% Publication began with volume 1, number 1, %%% in January 2002, and there was only one %%% issue per annual volume through 2005. Since %%% volume 5 (2006), there are only two issues %%% per volume. %%% %%% The journal has Web sites at %%% %%% https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208 %%% https://ieeexplore.ieee.org/xpl/issues?punumber=10208&isnumber=8610345 %%% %%% At version 1.12, the COMPLETE year coverage %%% looked like this: %%% %%% 2002 ( 12) 2010 ( 32) 2018 ( 61) %%% 2003 ( 7) 2011 ( 25) 2019 ( 45) %%% 2004 ( 9) 2012 ( 27) 2020 ( 42) %%% 2005 ( 2) 2013 ( 29) 2021 ( 45) %%% 2006 ( 18) 2014 ( 36) 2022 ( 38) %%% 2007 ( 14) 2015 ( 52) 2023 ( 41) %%% 2008 ( 21) 2016 ( 49) 2024 ( 61) %%% 2009 ( 34) 2017 ( 42) 2025 ( 3) %%% %%% Article: 745 %%% %%% Total entries: 745 %%% %%% Data for this bibliography have been derived %%% primarily from the publisher Web site, and %%% from the Web of Science Web site. %%% %%% Numerous errors in the Web sources noted %%% above have been corrected. 
Spelling has been %%% verified with the UNIX spell and GNU ispell %%% programs using the exception dictionary %%% stored in the companion file with extension %%% .sok. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ==================================================================== @Preamble{ "\ifx \undefined \booktitle \def \booktitle#1{{{\em #1}}} \fi" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|https://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Journal abbreviations: @String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"} %%% ==================================================================== %%% Bibliography entries, sorted in publication order with ``bibsort %%% --byvolume'': @Article{Alvarez:2002:IRF, author = "C. Alvarez and J. Corbal and E. Salami and M. 
Valero", title = "Initial Results on Fuzzy Floating Point Computation for Multimedia Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "1--1", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "During the recent years the market of mid low end portable systems such as PDAs or mobile digital phones have experimented a revolution in both selling volume and features as handheld devices incorporate Multimedia applications. This fact brings to an increase in the computational demands of the devices while still having the limitation of power and energy consumption. Instruction memoization is a promising technique to help alleviate the problem of power consumption of expensive functional units such as the floating point one. Unfortunately this technique could be energy inefficient for low end systems due to the additional power consumption of the relatively big tables required. In this paper we present a novel way of understanding multimedia floating point operations based on the fuzzy computation paradigm losses in the computation precision may exchange performance for negligible errors in the output. Exploiting the implicit characteristics of media FP computation we propose a new technique called fuzzy memoization. Fuzzy memoization expands the capabilities of classic memoization by attaching entries with similar inputs to the same output. We present a case of study for a SH like processor and report good performance and power delay improvements with feasible hardware requirements", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Delay; Energy consumption; Fuzzy systems; Handheld computers; Joining processes; Mobile computing; Multimedia systems; Performance loss; Personal digital assistants; Portable computers", } @Article{Gordon-Ross:2002:EFP, author = "A. Gordon-Ross and S. Cotterell and F. Vahid", title = "Exploiting Fixed Programs in Embedded Systems: a Loop Cache Example", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "2--2", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Embedded systems commonly execute one program for their lifetime. Designing embedded system architectures with configurable components, such that those components can be tuned to that one program based on a program pre-analysis, can yield significant power and performance benefits. We illustrate such benefits by designing a loop cache specifically with tuning in mind. Our results show a 70\% reduction in instruction memory access, for MIPS and 8051 processors representing twice the reduction from a regular loop cache, translating to good power savings.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architecture tuning; Computer architecture; Computer science; Costs; Digital cameras; Embedded computing; Embedded system; embedded systems.; fixed program; Loop cache; low power; Microcomputers; Microprocessor chips; Portable computers; Power engineering computing", } @Article{Choi:2002:LPT, author = "Jin-Hyuck Choi and Jung-Hoon Lee and Seh-Woong Jeong and Shin-Dug Kim and C. Weems", title = "A Low Power {TLB} Structure for Embedded Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "3--3", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We present a new two-level TLB (translation look-aside buffer) architecture that integrates a 2-way banked filter TLB with a 2-way banked main TLB. The objective is to reduce power consumption in embedded processors by distributing the accesses to TLB entries across the banks in a balanced manner. First, an advanced filtering technique is devised to reduce access power by adopting a sub-bank structure. Second, a bank-associative structure is applied to each level of the TLB hierarchy. Simulation results show that the Energy*Delay product can be reduced by about 40.9\% compared to a fully associative TLB, 24.9\% compared to a micro-TLB with 4+32 entries, and 12.18\% compared to a micro-TLB with 16+32 entries.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bank associative structure; CADCAM; Circuits; Computer aided manufacturing; Degradation; Embedded system; Energy consumption; Filter bank; filter mechanism; Filtering; low power design; Power filters; translation look-aside buffer; Virtual private networks", } @Article{Towles:2002:WCT, author = "B. Towles and W. J. Dally", title = "Worst-case Traffic for Oblivious Routing Functions", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "4--4", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper presents an algorithm to find a worst-case traffic pattern for any oblivious routing algorithm on an arbitrary interconnection network topology. The linearity of channel loading offered by oblivious routing algorithms enables the problem to be mapped to a bipartite maximum-weight matching, which can be solved in polynomial time for routing functions with a polynomial number of paths. Finding exact worst case performance was previously intractable, and we demonstrate an example case where traditional characterization techniques overestimate the throughput of a particular routing algorithm by 47\%.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bipartite graph; Linearity; Multiprocessor interconnection networks; Network topology; oblivious routing; Pattern matching; Polynomials; Routing; Telecommunication traffic; Throughput; worst-case throughput", } @Article{Unsal:2002:CFC, author = "O. S. Unsal and C. 
M. Krishna and C. A. Moritz",
  title =        "{Cool-Fetch}: Compiler-Enabled Power-Aware Fetch
                 Throttling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we present an architecture compiler
                 based approach to reduce energy consumption in the
                 processor. While we mainly target the fetch unit, an
                 important side-effect of our approach is that we
                 obtain energy savings in many other parts in the
                 processor. The explanation is that the fetch unit
                 often runs substantially ahead of execution, bringing
                 in instructions to different stages in the processor
                 that may never be executed. We have found, that
                 although the degree of Instruction Level Parallelism
                 (ILP) of a program tends to vary over time, it can be
                 statically predicted by the compiler with considerable
                 accuracy. Our Instructions Per Clock (IPC) prediction
                 scheme is using a dependence-testing-based analysis
                 and simple heuristics, to guide a front-end
                 fetch-throttling mechanism. We develop the necessary
                 architecture support and include its power overhead.
                 We perform experiments over a wide number of
                 architectural configurations, using SPEC2000
                 applications. Our results are very encouraging: we
                 obtain up to 15\% total energy savings in the
                 processor with generally little performance
                 degradation. In fact, in some cases our intelligent
                 throttling scheme even increases performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit.
Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; compiler architecture interaction;
                 Degradation; Energy consumption; fetch-throttling;
                 instruction level parallelism; Low power design;
                 Program processors",
}

@Article{Shang:2002:PEI,
  author =       "Li Shang and L. Peh and N. K. Jha",
  title =        "Power-efficient Interconnection Networks: Dynamic
                 Voltage Scaling with Links",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "6--6",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power consumption is a key issue in high performance
                 interconnection network design. Communication links,
                 already a significant consumer of power now, will take
                 up an ever larger portion of the power budget as
                 demand for network bandwidth increases. In this paper,
                 we motivate the use of dynamic voltage scaling (DVS)
                 for links, where the frequency and voltage of links
                 are dynamically adjusted to minimize power
                 consumption. We propose a history-based DVS algorithm
                 that judiciously adjusts DVS policies based on past
                 link utilization. Despite very conservative
                 assumptions about DVS link characteristics, our
                 approach realizes up to 4.5X power savings (3.2X
                 average), with just an average 27.4\% latency increase
                 and 2.5\% throughput reduction. To the best of our
                 knowledge, this is the first study that targets
                 dynamic power optimization of interconnection
                 networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit.
Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Dynamic voltage scaling; Frequency
                 synthesizers; interconnection network; Multiprocessor
                 interconnection networks; power optimization.;
                 Regulators",
}

@Article{KleinOsowski:2002:MNS,
  author =       "A. J. KleinOsowski and D. J. Lilja",
  title =        "{MinneSPEC}: a New {SPEC} Benchmark Workload for
                 Simulation-Based Computer Architecture Research",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "7--7",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Computer architects must determine how to most
                 effectively use finite computational resources when
                 running simulations to evaluate new architectural
                 ideas. To facilitate efficient simulations with a
                 range of benchmark programs, we have developed the
                 MinneSPEC input set for the SPEC CPU 2000 benchmark
                 suite. This new workload allows computer architects to
                 obtain simulation results in a reasonable time using
                 existing simulators. While the MinneSPEC workload is
                 derived from the standard SPEC CPU 2000 workload, it
                 is a valid benchmark suite in and of itself for
                 simulation-based research. MinneSPEC also may be used
                 to run large numbers of simulations to find ``sweet
                 spots'' in the evaluation parameter space. This small
                 number of promising design points subsequently may be
                 investigated in more detail with the full SPEC
                 reference workload. In the process of developing the
                 MinneSPEC datasets, we quantify its differences in
                 terms of function-level execution patterns,
                 instruction mixes, and memory behaviors compared to
                 the SPEC programs when executed with the reference
                 inputs.
We find that for some programs, the MinneSPEC profiles
                 match the SPEC reference dataset program behavior very
                 closely. For other programs, however, the MinneSPEC
                 inputs produce significantly different program
                 behavior. The MinneSPEC workload has been recognized
                 by SPEC and is distributed with Version 1.2 and higher
                 of the SPEC CPU 2000 benchmark suite.",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Computer architecture;
                 Computer simulation",
}

@Article{Vandierendonck:2002:ATC,
  author =       "H. Vandierendonck and K. {De Bosschere}",
  title =        "An Address Transformation Combining Block- and
                 Word-Interleaving",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "8--8",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As future superscalar processors employ higher issue
                 widths, an increasing number of load/store
                 instructions needs to be executed each cycle to
                 sustain high performance. Multi-bank data caches
                 attempt to address this issue in a cost-effective way.
                 A multi-bank cache consists of multiple cache banks
                 that each support one load/store instruction per clock
                 cycle. The interleaving of cache blocks over the banks
                 is of primary importance. Two common choices are
                 block-interleaving and word-interleaving. Although
                 word-interleaving leads to higher IPC, it is more
                 expensive to implement than block-interleaving since
                 it requires the tag array of the cache to be
                 multi-ported.
By swapping the bits in the effective address that are used by word-interleaving with those used by block-interleaving, it is possible to implement a word-interleaved cache with the same cost, cycle time and power consumption of a block interleaved cache. Because this makes the L1 data cache blocks sparse, additional costs are incurred at different levels of the memory hierarchy.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Block-Interleaving; Clocks; Costs; Data cache; Energy consumption; Interleaved codes; Multi-Banking; Word-Interleaving.", } @Article{Tambat:2002:PLB, author = "S. Tambat and S. Vajapeyam", title = "Page-Level Behavior of Cache Contention", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "9--9", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cache misses in small, limited-associativity primary caches very often replace live cache blocks, given the dominance of capacity and conflict misses. Towards motivating novel cache organizations, we study the comparative characteristics of the virtual memory address pairs involved in typical primary-cache contention (block replacements) for the SPEC2000integer benchmarks. We focus on the cache tag bits, and results show that (i) often just a few tag bits differ between contending addresses, and (ii) accesses to certain segments or page groups of the virtual address space (i.e., certain tag-bit groups) contend frequently. Cache conscious virtual address space allocation can further reduce the number of conflicting tag bits. 
We mention two directions for exploiting such page-level contention patterns to improve cache cost and performance.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Automation; Benchmark testing; Cache Contention; Cache Tags; Computer science; Data Cache; Libraries; Memory Access Characterization; Microprocessors; Optimizing compilers; Traffic control; Workstations", } @Article{Juang:2002:IDT, author = "Philo Juang and P. Diodato and S. Kaxiras and K. Skadron and Zhigang Hu and M. Martonosi and D. W. Clark", title = "Implementing Decay Techniques using {4T} Quasi-Static Memory Cells", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "10--10", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper proposes the use of four-transistor (4T) cache and branch predictor array cell designs to address increasing worries regarding leakage power dissipation. While 4T designs lose state when infrequently accessed, they have very low leakage, smaller area, and no capacitive loads to switch. This short paper gives an overview of 4T implementation issues and a preliminary evaluation of leakage-energy savings that shows improvements of 60-80\%", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Circuit simulation; Delay; Leakage current; Libraries; Microarchitecture; Power dissipation; Power generation; Random access memory; Switches; Transistors", } @Article{Sohn:2002:RRE, author = "YoungChul Sohn and NaiHoon Jung and Seungryoul Maeng", title = "Request Reordering to Enhance the Performance of Strict Consistency Models", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "11--11", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Advances in ILP techniques enable strict consistency models to relax memory order through speculative execution of memory operations. However, ordering constraints still hinder the performance because speculatively executed operations cannot be committed out of program order for the possibility of mis-speculation. In this paper, we propose a new technique which allows memory operations to be non-speculatively committed out of order without violating consistency constraints.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ILP; memory consistency model; multiprocessor", } @Article{Shaw:2002:MSC, author = "K. A. Shaw and W. J. 
Dally", title = "Migration in Single Chip Multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "1", number = "1", pages = "12--12", month = jan, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2002.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Global communication costs in future single-chip multiprocessors will increase linearly with distance. In this paper, we revisit the issues of locality and load balance in order to take advantage of these new costs. We present a technique which simultaneously migrates data and threads based on vectors specifying locality and resource usage. This technique improves performance on applications with distinguishable locality and imbalanced resource usage. 64\% of the ideal reduction in execution time was achieved on an application with these traits while no improvement was obtained on a balanced application with little locality.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Cost function; Delay; Global communication; Laboratories; Logic; Monitoring; Multiprocessing systems; Wire", } @Article{Sihn:2003:SCS, author = "K.-H. 
Sihn and Joonwon Lee and Jung-Wan Cho", title = "A Speculative Coherence Scheme using Decoupling Synchronization for Multiprocessor Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "2", number = "1", pages = "1--1", month = jan, year = "2003", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2003.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper proposes a new speculative coherence scheme, SCDS, for hardware distributed shared memory systems to reduce the overhead of coherence action in directory-based cache-coherence protocol. SCDS has two main features, predicting accurate timing of speculative coherence with synchronization information and detecting write pattern(migratory and non-migratory) for exclusive blocks' speculative coherence action. In our simulation, SCDS outperforms existing schemes (DSI and LTP) for well-synchronized applications.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Access protocols; Coherence; Costs; Delay; Hardware; Multiprocessing systems; Personal communication networks; Runtime; Timing; Watches", } @Article{Kumar:2003:PPR, author = "R. Kumar and K. Farkas and N. P. Jouppi and P. Ranganathan and D. M. 
Tullsen", title = "Processor Power Reduction Via Single-{ISA} Heterogeneous Multi-Core Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "2", number = "1", pages = "2--2", month = jan, year = "2003", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2003.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper proposes a single-ISA heterogeneous multi-core architecture as a mechanism to reduce processor power dissipation. It assumes a single chip containing a diverse set of cores that target different performance levels and consume different levels of power. During an application's execution, system software dynamically chooses the most appropriate core to meet specific performance and power requirements. It describes an example architecture with five cores of varying performance and complexity. Initial results demonstrate a five-fold reduction in energy at a cost of only 25\% performance.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; chip multiprocessor; Computer architecture; Computer science; Costs; Energy consumption; Fans; low-power architecture; Packaging; Power dissipation; Power engineering and energy; System software", } @Article{Sendag:2003:ACE, author = "R. Sendag and Peng-fei Chuang and D. J. 
Lilja", title = "Address Correlation: Exceeding the Limits of Locality", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "2", number = "1", pages = "3--3", month = jan, year = "2003", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2003.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We investigate a program phenomenon, Address Correlation, which links addresses that reference the same data. This work shows that different addresses containing the same data can often be correlated at run-time to eliminate a load miss or a partial hit. For ten of the SPEC CPU2000 benchmarks, 57 to 99\% of all L1 data cache load misses, and 4 to 85\% of all partial hits, can be supplied from a correlated address already found in the cache. Our source code-level analysis shows that semantically equivalent information, duplicated references, and frequent values are the major causes of address correlations. We also show that, on average, 68\% of the potential correlated addresses that could supply data on a miss of an address containing the same value can be correlated at run time. These correlated addresses correspond to an average of 62\% of all misses in the benchmark programs tested.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Delay; Electronic mail; Hardware; History; Microarchitecture; Object oriented modeling; Out of order; Runtime; Tellurium", } @Article{Milenkovic:2003:SBT, author = "A. Milenkovic and M. 
Milenkovic", title = "Stream-Based Trace Compression", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "2", number = "1", pages = "4--4", month = jan, year = "2003", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2003.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Trace-driven simulation has long been used in both processor and memory studies. The large size of traces motivated different techniques for trace reduction. These techniques often combine standard compression algorithms with trace-specific solutions, taking into account the tradeoff between reduction in the trace size and simulation slowdown due to decompression. This paper introduces SBC, a new algorithm for instruction and data address trace compression based on instruction streams. The proposed technique significantly reduces trace size and simulation time, and it is orthogonal to general compression algorithms. When combined with gzip, SBC reduces the size of SPEC CPU2000 traces 94-71968 times.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Clocks; Compression algorithms; Computational modeling; Computer architecture; Computer simulation; Data mining; Information analysis; instruction and address trace; Instruments; Predictive models; Redundancy; simulation; trace compression", } @Article{Zhang:2003:WHC, author = "Chuanjun Zhang and F. Vahid and Jun Yang and W. 
Najjar",
  title =        "A Way-Halting Cache for Low-Energy High-Performance
                 Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We have designed a low power four-way set associative
                 cache that stores the four lowest-order bits of all
                 ways' tags into a fully associative memory, which we
                 call the halt tag array. The comparison of the halt
                 tag array with the desired tag occurs concurrently
                 with the address decoding that determines which tag
                 and data ways to read from. The halt tag array
                 predetermines most tags that cannot match due to their
                 low-order four bits mismatching. Further accesses to
                 ways with known mismatching tags are then halted, thus
                 saving power. Our halt tag array has the additional
                 feature of using static logic only, rather than
                 dynamic logic used in highly-associative caches,
                 making our cache consume even less power. Our result
                 shows 55\% savings of memory access related energy
                 over a conventional four-way set-associative cache. We
                 show nearly 2x energy savings compared with highly
                 associative caches, while imposing no performance
                 overhead and only 2\% cache area overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cams; Circuits; Computer science; Decoding; Design
                 engineering; Embedded computing; Logic arrays; Power
                 engineering and energy; Power engineering computing;
                 Switches",
}

@Article{Cohen:2003:EOP,
  author =       "A. Cohen and F. Finkelstein and A. Mendelson and R.
                 Ronen and D.
Rudoy", title = "On Estimating Optimal Performance of {CPU} Dynamic Thermal Management", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "2", number = "1", pages = "6--6", month = jan, year = "2003", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2003.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper we focus on dynamic thermal management (DTM) strategies that use dynamic voltage scaling (DVS) for power control. We perform a theoretical analysis targeted at estimating the optimal strategy, and show two facts: (1) when there is a gap between the initial and the limit temperatures, it is best to start with a high (though not necessarily maximal) frequency and decrease it exponentially until the limit temperature is reached; (2) when being close to the limit temperature, the best strategy is to stay there. We use the patterns exhibited by the optimal strategy in order to analyze some existing DTM techniques.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Costs; DTM; DVS; Energy management; Frequency estimation; Microprocessors; optimal control; Pattern analysis; Performance analysis; Temperature control; Temperature sensors; Thermal management; Voltage control", } @Article{Cristal:2003:CRC, author = "A. Cristal and J. F. Martinez and J. Llosa and M.
Valero", title = "A case for resource-conscious out-of-order processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "2", number = "1", pages = "7--7", month = jan, year = "2003", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2003.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Modern out-of-order processors tolerate long-latency memory operations by supporting a large number of in-flight instructions. This is achieved in part through proper sizing of critical resources, such as register files or instruction queues. In light of the increasing gap between processor speed and memory latency, tolerating upcoming latencies in this way would require impractical sizes of such critical resources. To tackle this scalability problem, we make a case for resource-conscious out-of-order processors. We present quantitative evidence that critical resources are increasingly underutilized in these processors. We advocate that better use of such resources should be a priority in future research in processor architectures.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bars; checkpointing.; Computer aided instruction; Delay; instruction-level parallelism; Laboratories; memory latency; Optimal control; Out of order; Out-of-order processor; Queueing analysis; Registers; Resource management; resource utilization; Voltage control", } @Article{Citron:2004:ELE, author = "D. 
Citron", title = "Exploiting Low Entropy to Reduce Wire Delay", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "1--1", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Wires shrink less efficiently than transistors. Smaller dimensions increase relative delay and the probability of crosstalk. Solutions to this problem include adding additional latency with pipelining, using ``fat wires'' at higher metal levels, and advances in process and material technology. We propose a stopgap solution to this problem by applying a decade old technique called bus-expanding to the problem. By exploiting low spatial and temporal entropy of data it is possible to transfer m bits of data over an n-bit wide bus in a single cycle (m > n). High entropy data will be routed directly over the bus while low entropy data will be compacted using small lookup tables. A table index will be transferred in the case of a successful lookup, otherwise the full value will be transferred in several cycles. Reducing the number of wires per bus, enables the use of wider wires, which in turn reduces the wire delay. Examination of projected process technologies shows that by shrinking the number of bits in a bus (64 > 48) instead of shrinking the individual wires maintains a constant wire delay. Tests on SPEC CPU2000 have shown that for the 64-bit buses leading from the L1 caches to the processor core it is possible to transfer all data types (addresses, integers, instructions and floating-points) using 40-bits per bus on the average.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit.
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Area measurement; Compaction; Crosstalk; Delay; Entropy; Materials science and technology; Pipeline processing; Power measurement; Transistors; Wire", } @Article{Singh:2004:GAL, author = "A. Singh and W. J. Dally and B. Towles and A. K. Gupta", title = "Globally Adaptive Load-Balanced Routing on Tori", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "2--2", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We introduce a new method of adaptive routing on k-ary n-cubes, Globally Adaptive Load-Balance (GAL). GAL makes global routing decisions using global information. In contrast, most previous adaptive routing algorithms make local routing decisions using local information (typically channel queue depth). GAL senses global congestion using segmented injection queues to decide the directions to route in each dimension. It further load balances the network by routing in the selected directions adaptively. Using global information, GAL achieves the performance (latency and throughput) of minimal adaptive routing on benign traffic patterns and performs as well as the best obliviously load-balanced routing algorithm (GOAL) on adversarial traffic.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Chaos; Delay; Nearest neighbor searches; Routing; Stability; Switches; Telecommunication traffic; Throughput; Tornadoes; Traffic control", } @Article{Gomez:2004:EFT, author = "M. E. Gomez and J. Duato and J. 
Flich and P. Lopez and A. Robles and N. A. Nordbotten and O. Lysne and T. Skeie", title = "An Efficient Fault-Tolerant Routing Methodology for Meshes and Tori", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "3--3", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper we present a methodology to design fault-tolerant routing algorithms for regular direct interconnection networks. It supports fully adaptive routing, does not degrade performance in the absence of faults, and supports a reasonably large number of faults without significantly degrading performance. The methodology is mainly based on the selection of an intermediate node (if needed) for each source-destination pair. Packets are adaptively routed to the intermediate node and, at this node, without being ejected, they are adaptively forwarded to their destinations. In order to allow deadlock-free minimal adaptive routing, the methodology requires only one additional virtual channel (for a total of three), even for tori. Evaluation results for a 4 x 4 x 4 torus network show that the methodology is 5-fault tolerant. Indeed, for up to 14 link failures, the percentage of fault combinations supported is higher than 99.96\%. Additionally, network throughput degrades by less than 10\% when injecting three random link faults without disabling any node. In contrast, a mechanism similar to the one proposed in the BlueGene/L, that disables some network planes, would strongly degrade network throughput by 79\%.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithm design and analysis; Circuit faults; Degradation; Design methodology; Electronic mail; Fault tolerance; Multiprocessor interconnection networks; Routing; Switches; Throughput", } @Article{Stine:2004:CAR, author = "J. M. Stine and N. P. Carter and J. Flich", title = "Comparing Adaptive Routing and Dynamic Voltage Scaling for Link Power Reduction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "4--4", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We compare techniques that dynamically scale the voltage of individual network links to reduce power consumption with an approach in which all links in the network are set to the same voltage and adaptive routing is used to distribute load across the network. Our results show that adaptive routing with static network link voltages outperforms dimension-order routing with dynamic link voltages in all cases, because the adaptive routing scheme can respond more quickly to changes in network demand. Adaptive routing with static link voltages also outperforms adaptive routing with dynamic link voltages in many cases, although dynamic link voltage scaling gives better behavior as the demand on the network grows.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Clocks; Dynamic voltage scaling; Energy consumption; Frequency; Network-on-a-chip; Routing; Telecommunication traffic; Traffic control; Voltage control", } @Article{Robatmili:2004:TSI, author = "B. Robatmili and N. Yazdani and S. Sardashti and M. Nourani", title = "Thread-Sensitive Instruction Issue for {SMT} Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "5--5", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Simultaneous Multi Threading (SMT) is a processor design method in which concurrent hardware threads share processor resources like functional units and memory. The scheduling complexity and performance of an SMT processor depend on the topology used in the fetch and issue stages. In this paper, we propose a thread sensitive issue policy for a partitioned SMT processor which is based on a thread metric. We propose the number of ready-to-issue instructions of each thread as priority metric. To evaluate our method, we have developed a reconfigurable SMT-simulator on top of the SimpleScalar Toolset. We simulated our modeled processor under several workloads composed of SPEC benchmarks. Experimental results show around 30\% improvement compared to the conventional OLDEST\_FIRST mixed topology issue policy. Additionally, the hardware implementation of our architecture with this metric in issue stage is quite simple.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Clocks; Delay; Frequency; Intrusion detection; Laboratories; Logic; Processor scheduling; Surface-mount technology; Topology", } @Article{Luo:2004:EES, author = "Yue Luo and L. K. John", title = "Efficiently Evaluating Speedup Using Sampled Processor Simulation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "6--6", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cycle accurate simulation of processors is extremely time consuming. Sampling can greatly reduce simulation time while retaining good accuracy. Previous research on sampled simulation has been focusing on the accuracy of CPI. However, most simulations are used to evaluate the benefit of some microarchitectural enhancement, in which the speedup is a more important metric than CPI. We employ the ratio estimator from statistical sampling theory to design efficient sampling to measure speedup and to quantify its error. We show that to achieve a given relative error limit for speedup, it is not necessary to estimate CPI to the same accuracy. In our experiment, estimating speedup requires about 9X fewer instructions to be simulated in detail in comparison to estimating CPI for the same relative error limit. Therefore using the ratio estimator to evaluate speedup is much more cost-effective and offers great potential for reducing simulation time. We also discuss the reason for this interesting and important result.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; Clocks; Computational modeling; Computer errors; Computer simulation; Frequency; Microarchitecture; Sampling methods; Size measurement; Velocity measurement", } @Article{Ceze:2004:CHL, author = "L. Ceze and K. Strauss and J. Tuck and J. Renau and J. Torrellas", title = "{CAVA}: Hiding {L2} Misses with Checkpoint-Assisted Value Prediction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "7--7", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Load misses in on-chip L2 caches often end up stalling modern superscalars. To address this problem, we propose hiding L2 misses with Checkpoint-Assisted VAlue prediction (CAVA). When a load misses in L2, a predicted value is returned to the processor. If the missing load reaches the head of the reorder buffer before the requested data is received from memory, the processor checkpoints, consumes the predicted value, and speculatively continues execution. When the requested data finally arrives, it is compared to the predicted value. If the prediction was correct, execution continues normally; otherwise, execution rolls back to the checkpoint. Compared to a baseline aggressive superscalar, CAVA speeds up execution by a geometric mean of 1.14 for SPECint and 1.34 for SPECfp applications. Additionally, CAVA is faster than an implementation of Runahead execution, and Runahead with value prediction.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; Checkpointing; Costs; Delay; Hardware; Microarchitecture; Out of order; Pipelines; Prefetching; Recycling", } @Article{Singh:2004:BDB, author = "A. Singh and W. J. Dally", title = "Buffer and Delay Bounds in High Radix Interconnection Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "8--8", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We apply recent results in queueing theory to propose a methodology for bounding the buffer depth and packet delay in high radix interconnection networks. While most work in interconnection networks has been focused on the throughput and average latency in such systems, few studies have been done providing statistical guarantees for buffer depth and packet delays. These parameters are key in the design and performance of a network. We present a methodology for calculating such bounds for a practical high radix network and through extensive simulations show its effectiveness for both bursty and non-bursty injection traffic. Our results suggest that modest speedups and buffer depths enable reliable networks without flow control to be constructed.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Convergence; Delay; Intelligent networks; Multiprocessor interconnection networks; Queueing analysis; Supercomputers; Switches; Telecommunication traffic; Throughput; Traffic control", } @Article{Holloway:2004:CPS, author = "A. L. 
Holloway and G. S. Sohi", title = "Characterization of Problem Stores", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "9--9", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper introduces the concept of problem stores: static stores whose dependent loads often miss in the cache. Accurately identifying problem stores allows the early determination of addresses likely to cause later misses, potentially allowing for the development of novel, proactive prefetching and memory hierarchy management schemes. We present a detailed empirical characterization of problem stores using the SPEC2000 CPU benchmarks. The data suggests several key observations about problem stores. First, we find that the number of important problem stores is typically quite small; the worst 100 problem stores write the values that will lead to about 90\% of non-cold misses for a variety of cache configurations. We also find that problem stores only account for 1 in 8 dynamic stores, though they result in 9 of 10 misses. Additionally, the problem stores dependent loads miss in the L2 cache a larger fraction of the time than loads not dependent on problem stores. We also observe the set of problem stores is stable across a variety of cache configurations. Finally, we found that the instruction distance from problem store to miss and problem store to evict is often greater than one million instructions, but the value is often needed within 100,000 instructions of the eviction.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Delay; Hardware; Memory management; Prefetching; Proposals; Timing", } @Article{Sazeides:2005:DIB, author = "Y. Sazeides and R. Kumar and D. M. Tullsen and T. Constantinou", title = "The Danger of Interval-Based Power Efficiency Metrics: When Worst Is Best", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "4", number = "1", pages = "1--1", month = jan, year = "2005", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2005.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper shows that if the execution of a program is divided into distinct intervals, it is possible for one processor or configuration to provide the best power efficiency over every interval, and yet have worse overall power efficiency over the entire execution than other configurations. This unintuitive behavior is a result of a seemingly intuitive use of power efficiency metrics, and can result in suboptimal design and execution decisions. This behavior may occur when using the energy-delay product and energy-delay-squared product metrics but not with the energy metric.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Battery charge measurement; Clocks; Computer science; Delay; Design optimization; Frequency; Out of order; Power engineering and energy; Power measurement", } @Article{Mutlu:2005:RRP, author = "O. Mutlu and Hyesoon Kim and J. Stark and Y. N.
Patt", title = "On Reusing the Results of Pre-Executed Instructions in a Runahead Execution Processor", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "4", number = "1", pages = "2--2", month = jan, year = "2005", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2005.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Previous research on runahead execution took it for granted as a prefetch-only technique. Even though the results of instructions independent of an L2 miss are correctly computed during runahead mode, previous approaches discarded those results instead of trying to utilize them in normal mode execution. This paper evaluates the effect of reusing the results of preexecuted instructions on performance. We find that, even with an ideal scheme, it is not worthwhile to reuse the results of preexecuted instructions. Our analysis provides insights into why result reuse does not provide significant performance improvement in runahead processors and concludes that runahead execution should be employed as a prefetching mechanism rather than a full-blown prefetching/result-reuse mechanism.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Computational modeling; Computer aided instruction; Delay; Energy consumption; Microprocessors; Performance analysis; Prefetching; Registers", } @Article{Zhang:2006:BIC, author = "Chuanjun Zhang", title = "Balanced instruction cache: reducing conflict misses of direct-mapped caches through balanced subarray accesses", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "2--5", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "It is observed that the limited memory space of direct-mapped caches is not used in balance therefore incurs extra conflict misses. We propose a novel cache organization of a balanced cache, which balances accesses to cache sets at the granularity of cache subarrays. The key technique of the balanced cache is a programmable subarray decoder through which the mapping of memory reference addresses to cache subarrays can be optimized hence conflict misses of direct-mapped caches can be resolved. The experimental results show that the miss rate of balanced cache is lower than that of the same sized two-way set-associative caches on average and can be as low as that of the same sized four-way set-associative caches for particular applications. Compared with previous techniques, the balanced cache requires only one cycle to access all cache hits and has the same access time as direct-mapped caches", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "balanced instruction cache; balanced subarray accesses; Bridges; Cache memory; cache organization; cache storage; Clocks; conflict miss reduction; Decoding; Delay; Frequency; High performance computing; programmable subarray decoder; storage allocation", } @Article{Ottoni:2006:SPC, author = "G. Ottoni and R. Rangan and A. Stoler and M. J. Bridges and D. I. August", title = "From sequential programs to concurrent threads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "6--9", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Chip multiprocessors are of increasing importance due to difficulties in achieving higher clock frequencies in uniprocessors, but their success depends on finding useful work for the processor cores. This paper addresses this challenge by presenting a simple compiler approach that extracts non-speculative thread-level parallelism from sequential codes. We present initial results from this technique targeting a validated dual-core processor model, achieving speedups ranging from 9-48\% with an average of 25\% for important benchmark loops over their single-threaded versions. We also identify important next steps found during our pursuit of higher degrees of automatic threading", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "automatic threading; Bridges; Clocks; Computer science; concurrency control; concurrent threads; Frequency; Hardware; Microprocessors; multi-threading; nonspeculative thread-level parallelism; Parallel processing; Pipeline processing; program compiler; program compilers; Program processors; sequential programs", } @Article{Gupta:2006:TOI, author = "A. K. Gupta and W. J. Dally", title = "Topology optimization of interconnection networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "10--13", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper describes an automatic optimization tool that searches a family of network topologies to select the topology that best achieves a specified set of design goals while satisfying specified packaging constraints. Our tool uses a model of signaling technology that relates bandwidth, cost and distance of links. This model captures the distance-dependent bandwidth of modern high-speed electrical links and the cost differential between electrical and optical links. Using our optimization tool, we explore the design space of hybrid Clos-torus (C-T) networks. For a representative set of packaging constraints we determine the optimal hybrid C-T topology to minimize cost and the optimal C-T topology to minimize latency for various packet lengths. We then use the tool to measure the sensitivity of the optimal topology to several important packaging constraints such as pin count and critical distance", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Constraint optimization; Costs; Design optimization; hybrid Clos-torus networks; interconnection networks; Multiprocessor interconnection networks; multistage interconnection networks; Network topology; Optical fiber communication; Packaging; signaling technology; signalling; Space exploration; Space technology; telecommunication network topology; topology optimization tool", } @Article{Gaudiot:2006:F, author = "J.-L. Gaudiot and Y. Patt and K. Skadron", title = "Foreword", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "11--11", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Foreword for issue 1 of 2006", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Computer Society; Concrete; Delay; Footwear; Software libraries; Vehicles", } @Article{Morad:2006:PPE, author = "T. Y. Morad and U. C. Weiser and A. Kolodny and M. Valero and E.
Ayguade", title = "Performance, power efficiency and scalability of asymmetric cluster chip multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "14--17", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper evaluates asymmetric cluster chip multiprocessor (ACCMP) architectures as a mechanism to achieve the highest performance for a given power budget. ACCMPs execute serial phases of multithreaded programs on large high-performance cores whereas parallel phases are executed on a mix of large and many small simple cores. Theoretical analysis reveals a performance upper bound for symmetric multiprocessors, which is surpassed by asymmetric configurations at certain power ranges. Our emulations show that asymmetric multiprocessors can reduce power consumption by more than two thirds with similar performance compared to symmetric multiprocessors", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ACCMP; Application software; asymmetric cluster chip multiprocessors; Chip Multiprocessors; Emulation; Frequency; microprocessor chips; multi-threading; multiprocessing systems; multithreaded program; Optimized production technology; Parallel processing; parallel processing; power consumption reduction; power efficiency; Power Efficiency; Power system modeling; Queueing analysis; Scalability; Upper bound; Voltage", } @Article{Riley:2006:PCU, author = "N. Riley and C. 
Zilles", title = "Probabilistic counter updates for predictor hysteresis and bias", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "18--21", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware predictor designers have incorporated hysteresis and/or bias to achieve desired behavior by increasing the number of bits per counter. Some resulting proposed predictor designs are currently impractical because their counter tables are too large. We describe a method for dramatically reducing the amount of storage required for a predictor's counter table with minimal impact on prediction accuracy. Probabilistic updates to counter state are implemented using a hardware pseudo-random number generator to increment or decrement counters a fraction of the time, meaning fewer counter bits are required. We demonstrate the effectiveness of probabilistic updates in the context of Fields et al.'s critical path predictor, which employs a biased 6-bit counter. Averaged across the SPEC CINT2000 benchmarks, our 2-bit and 3-bit probabilistic counters closely approximate a 6-bit deterministic one (achieving speedups of 7.75\% and 7.91\% compared to 7.94\%) when used for criticality-based scheduling in a clustered machine. Performance degrades gracefully, enabling even a 1-bit probabilistic counter to outperform the best 3-bit deterministic counter we found", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; clustered machine; computer architecture; Computer science; Costs; Counting circuits; critical path predictor; criticality-based scheduling; Degradation; Hardware; hardware predictor design; hardware pseudorandom number generator; Hysteresis; Microarchitecture; Pipelines; predictor bias; predictor hysteresis; predictors counter table; probabilistic counter update; probability; Processor scheduling; processor scheduling; random number generation", } @Article{Zhou:2006:CFT, author = "Huiyang Zhou", title = "A case for fault tolerance and performance enhancement using chip multi-processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "22--25", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper makes a case for using multi-core processors to simultaneously achieve transient-fault tolerance and performance enhancement. Our approach is extended from a recent latency-tolerance proposal, dual-core execution (DCE). In DCE, a program is executed twice in two processors, named the front and back processors. The front processor pre-processes instructions in a very fast yet highly accurate way and the back processor re-executes the instruction stream retired from the front processor. The front processor runs faster as it has no correctness constraints whereas its results, including timely prefetching and prompt branch misprediction resolution, help the back processor make faster progress. In this paper, we propose to entrust the speculative results of the front processor and use them to check the un-speculative results of the back processor. 
A discrepancy, either due to a transient fault or a mispeculation, is then handled with the existing mispeculation recovery mechanism. In this way, both transient-fault tolerance and performance improvement can be delivered simultaneously with little hardware overhead", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "back processor; chip multiprocessors; Computer aided software engineering; dual-core execution; Error analysis; Fault tolerance; fault tolerant computing; front processor; Hardware; latency-tolerance proposal; microprocessor chips; mispeculation recovery mechanism; Multicore processing; multiprocessing systems; prefetching; Prefetching; prompt branch misprediction resolution; Proposals; Redundancy; storage management; Throughput; transient-fault tolerance; Transistors", } @Article{Lee:2006:ASC, author = "Moon-Sang Lee and Sang-Kwon Lee and Joonwon Lee and Seung-Ryoul Maeng", title = "Adopting system call based address translation into user-level communication", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "26--29", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "User-level communication alleviates the software overhead of the communication subsystem by allowing applications to access the network interface directly. For that purpose, efficient address translation of virtual address to physical address is critical. 
In this study, we propose a system call based address translation scheme where every translation is done by the kernel instead of a translation cache on a network interface controller as in the previous cache based address translation. According to our experiments, our scheme achieves up to 4.5\% reduction in application execution time compared to the previous cache based approach", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; cache based approach; cache storage; Communication system software; Control systems; Costs; Delay; Electronic mail; Hardware; Kernel; network interface controller; network interfaces; Network interfaces; operating system kernels; Protocols; software overhead; system call based address translation; user-level communication", } @Article{Ahn:2006:DPA, author = "Jung Ho Ahn and W. J. Dally", title = "Data parallel address architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "30--33", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Data parallel memory systems must maintain a large number of outstanding memory references to fully use increasing DRAM bandwidth in the presence of increasing latency. At the same time, the throughput of modern DRAMs is very sensitive to access patterns due to the time required to precharge and activate banks and to switch between read and write access. To achieve memory reference parallelism a system may simultaneously issue references from multiple reference threads. Alternatively multiple references from a single thread can be issued in parallel. 
In this paper, we examine this tradeoff and show that allowing only a single thread to access DRAM at any given time significantly improves performance by increasing the locality of the reference stream and hence reducing precharge/activate operations and read/write turnaround. Simulations of scientific and multimedia applications show that generating multiple references from a single thread gives, on average, 17\% better performance than generating references from two parallel threads", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Computer architecture; data parallel address architecture; data parallel memory systems; Delay; DRAM bandwidth; DRAM chips; Memory management; parallel architectures; parallel memories; Parallel processing; Random access memory; read access; Scheduling; Streaming media; Switches; write access", } @Article{Eisley:2006:NCC, author = "N. Eisley and Li-Shiuan Peh and Li Shang", title = "In-network cache coherence", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "34--37", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We propose implementing cache coherence protocols within the network, demonstrating how an in-network implementation of the MSI directory-based protocol allows for in-transit optimizations of read and write delay. Our results show 15\% and 24\% savings on average in memory access latency for SPLASH-2 parallel benchmarks running on a $4 \times 4$ and a $16 \times 16$ multiprocessor respectively", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Access protocols; benchmark testing; cache coherence; cache storage; Coherence; Delay; delays; Fabrics; interconnection network; memory access latency; Memory architecture; memory architecture; memory protocols; Moore's Law; MSI directory-based protocol; Multiprocessor interconnection networks; network cache coherence protocols; parallel processing; read delay; SPLASH-2 parallel benchmarks; write delay", } @Article{Srinivasan:2006:PMU, author = "R. Srinivasan and J. Cook and O. Lubeck", title = "Performance modeling using {Monte Carlo} simulation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "38--41", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/intel-ia-64.bib", abstract = "Cycle accurate simulation has long been the primary tool for micro-architecture design and evaluation. Though accurate, the slow speed often imposes constraints on the extent of design exploration. In this work, we propose a fast, accurate Monte-Carlo based model for predicting processor performance. We apply this technique to predict the CPI of in-order architectures and validate it against the Itanium-2. The Monte Carlo model uses micro-architecture independent application characteristics, and cache, branch predictor statistics to predict CPI with an average error of less than 7\%. Since prediction is achieved in a few seconds, the model can be used for fast design space exploration that can efficiently cull the space for cycle-accurate simulations. 
Besides accurately predicting CPI, the model also breaks down CPI into various components, where each component quantifies the effect of a particular stall condition (branch misprediction, cache miss, etc.) on overall CPI. Such a CPI decomposition can help processor designers quickly identify and resolve critical performance bottlenecks", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "branch predictor statistics; Computational modeling; Computer architecture; CPI decomposition; design space exploration; Error analysis; Itanium-2; Laboratories; Mathematical analysis; memory architecture; microarchitecture design; microarchitecture evaluation; Monte Carlo methods; Monte Carlo simulation; performance evaluation; Predictive models; Process design; processor performance modeling; program processors; Sampling methods; Space exploration", } @Article{Ergin:2006:ENV, author = "O. Ergin and O. Unsal and X. Vera and A. Gonzalez", title = "Exploiting Narrow Values for Soft Error Tolerance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "12--12", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Soft errors are an important challenge in contemporary microprocessors. Particle hits on the components of a processor are expected to create an increasing number of transient errors with each new microprocessor generation. In this paper we propose simple mechanisms that effectively reduce the vulnerability to soft errors in a processor. 
Our designs are generally motivated by the fact that many of the produced and consumed values in the processors are narrow and their upper order bits are meaningless. Soft errors caused by any particle strike to these higher order bits can be avoided by simply identifying these narrow values. Alternatively soft errors can be detected or corrected on the narrow values by replicating the vulnerable portion of the value inside the storage space provided for the upper order bits of these operands. We offer a variety of schemes that make use of narrow values and analyze their efficiency in reducing soft error vulnerability of level-1 data cache of the processor", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; Cache storage; contemporary microprocessors; data cache; Data Cache; Error correction; error correction; Error Correction; error correction; error detection; Hardware; Impurities; Manufacturing; microprocessor chips; Microprocessors; Multithreading; Narrow Values; narrow values; Neutrons; particle strike; Process design; radiation effects; Random access memory; soft error tolerance; Soft Errors; system recovery; transient errors; transients", } @Article{Li:2006:PBH, author = "W. Li and S. Mohanty and K. 
Kavi", title = "A Page-based Hybrid (Software--Hardware) Dynamic Memory Allocator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "13--13", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/java2000.bib", abstract = "Modern programming languages often include complex mechanisms for dynamic memory allocation and garbage collection. These features drive the need for more efficient implementation of memory management functions, both in terms of memory usage and execution performance. In this paper, we introduce a software and hardware co-design to improve the speed of the software allocator used in free-BSD systems. The hardware complexity of our design is independent of the dynamic memory size, thus making the allocator suitable for any memory size. Our design improves the performance of memory management intensive benchmarks by as much as 43\%. To our knowledge, this is the first-ever work of this kind, introducing ``hybrid memory allocator''", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; Computer languages; Computer science; Costs; Delay; Dynamic programming; garbage collection; Hardware; hardware complexity; hardware-software codesign; hybrid dynamic memory allocator; Java; memory allocator; memory architecture; memory management; Memory management; modern programming languages; software allocator; Software performance; software-hardware co-design; software/hardware co-design; storage allocation; storage management", } @Article{Donald:2006:EPP, author = "J. Donald and M. 
Martonosi", title = "An Efficient, Practical Parallelization Methodology for Multicore Architecture Simulation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "14--14", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.14", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Multiple core designs have become commonplace in the processor market, and are hence a major focus in modern computer architecture research. Thus, for both product development and research, multiple core processor simulation environments are necessary. A well-known positive feedback property of computer design is that we use today's computers to design tomorrow's. Thus, with the emergence of chip multiprocessors, it is natural to re-examine simulation environments written to exploit parallelism. In this paper we present a programming methodology for directly converting existing uniprocessor simulators into parallelized multiple-core simulators. Our method not only takes significantly less development effort compared to some prior used programming techniques, but also possesses advantages by retaining a modular and comprehensible programming structure. We demonstrate our case with actual developed products after applying this method to two different simulators, one developed from IBM Turandot and the other from the SimpleScalar tool set. Our SimpleScalar-based framework achieves a parallel speedup of $2.2 \times$ on a dual-CPU dual-core (4-way) Opteron server", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "chip multiprocessors; comprehensible programming structure; Computational modeling; Computer architecture; Computer simulation; Feedback; IBM Turandot; logic simulation; microcomputers; modern computer architecture; modular programming structure; multicore; multicore architecture simulation; Multicore processing; multiple core processor simulation; multiprocessing systems; Object oriented modeling; parallel architectures; Parallel processing; Parallel programming; parallelism; parallelization method; parallelized multiple-core simulators; positive feedback property; Process planning; Product development; programming methodology; SimpleScalar tool set; simulation", } @Article{Bracy:2006:DAC, author = "A. Bracy and K. Doshi and Q. Jacobson", title = "Disintermediated Active Communication", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "15--15", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Disintermediated active communication (DAC) is a new paradigm of communication in which a sending thread actively engages a receiving thread when sending it a message via shared memory. DAC is different than existing approaches that use passive communication through shared-memory --- based on intermittently checking for messages --- or that use preemptive communication but must rely on intermediaries such as the operating system or dedicated interrupt channels. An implementation of DAC builds on existing cache coherency support and exploits light-weight user-level interrupts. 
Inter-thread communication occurs via monitored memory locations where the receiver thread responds to invalidations of monitored addresses with a light-weight user-level software-defined handler. Address monitoring is supported by cache line user-bits, or CLUbits. CLUbits reside in the cache next to the coherence state, are private per thread, and maintain user-defined per-cache-line state. A light weight software library can demultiplex asynchronous notifications and handle exceptional cases. In DAC-based programs threads coordinate with one another by explicit signaling and implicit resource monitoring. With the simple and direct communication primitives of DAC, multi-threaded workloads synchronize at a finer granularity and more efficiently utilize the hardware of upcoming multi-core designs. This paper introduces DAC, presents several signaling models for DAC-based programs, and describes a simple memory-based framework that supports DAC by leveraging existing cache-coherency models. Our framework is general enough to support uses beyond DAC", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "address monitoring; cache coherency; cache line user-bits; cache storage; CLUbits; Computer aided instruction; Concurrent computing; disintermediated active communication; Hardware; High performance computing; interrupts; interthread communication; memory locations; Monitoring; multi-threading; multicore designs; Operating systems; Processor scheduling; Programming profession; resource monitoring; shared memory; shared memory systems; signaling models; software libraries; Software libraries; software library; storage allocation; user-level interrupts", } @Article{Mallik:2006:UDF, author = "A. Mallik and B. Lin and G. Memik and P. Dinda and R. P. 
Dick", title = "User-Driven Frequency Scaling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "16--16", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.16", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We propose and evaluate user-driven frequency scaling (UDFS) for improved power management on processors that support dynamic voltage and frequency scaling (DVFS), e.g., those used in current laptop and desktop computers. UDFS dynamically adapts CPU frequency to the individual user and the workload through a simple user feedback mechanism, unlike currently-used DVFS methods which rely only on CPU utilization. Our UDFS algorithms dramatically reduce typical operating frequencies while maintaining performance at satisfactory levels for each user. We evaluated our techniques through user studies conducted on a Pentium M laptop running Windows applications. The UDFS scheme reduces measured system power by 22.1\%, averaged across all our users and applications, compared to the Windows XP DVFS scheme", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Central Processing Unit; computer power supplies; CPU frequency; DVFS; dynamic frequency scaling; Dynamic voltage scaling; dynamic voltage scaling; Energy consumption; Energy management; Engineering management; Feedback; Frequency control; improved power management; microprocessor chips; Pentium M laptop; Portable computers; power aware computing; Power engineering computing; Power Management; Power measurement; user feedback mechanism; User-aware computing; user-driven frequency scaling; Windows XP DVFS scheme", } @Article{Blundell:2006:STM, author = "C. Blundell and E. C. Lewis and M. M. K. Martin", title = "Subtleties of transactional memory atomicity semantics", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "17--17", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.18", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Transactional memory has great potential for simplifying multithreaded programming by allowing programmers to specify regions of the program that must appear to execute atomically. Transactional memory implementations then optimistically execute these transactions concurrently to obtain high performance. This work shows that the same atomic guarantees that give transactions their power also have unexpected and potentially serious negative effects on programs that were written assuming narrower scopes of atomicity. 
We make four contributions: (1) we show that a direct translation of lock-based critical sections into transactions can introduce deadlock into otherwise correct programs, (2) we introduce the terms strong atomicity and weak atomicity to describe the interaction of transactional and non-transactional code, (3) we show that code that is correct under weak atomicity can deadlock under strong atomicity, and (4) we demonstrate that sequentially composing transactional code can also introduce deadlocks. These observations invalidate the intuition that transactions are strictly safer than lock-based critical sections, that strong atomicity is strictly safer than weak atomicity, and that transactions are always composable", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer languages; Computer Systems Organization; Concurrent distributed and parallel languages; deadlock; direct translation; Hardware; Information science; Interference; Interleaved codes; Language Classifications; Law; lock-based critical sections; Multi-core/single-chip multiprocessors; multi-threading; Multiple Data Stream Architectures (Multiprocessors); multithreaded programming; nontransactional code; operating systems (computers); Parallel Architectures; Processor Architectures; program verification; Programming Languages; Programming profession; sequentially composing transactional code; Software performance; Software/Software Engineering; strong atomicity; System recovery; Transaction databases; transaction processing; transactional memory atomicity semantics; weak atomicity", } @Article{Price:2006:CCT, author = "G. Price and M. 
Vachharajani", title = "A Case for Compressing Traces with {BDDs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "18--18", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.17", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Instruction-level traces are widely used for program and hardware analysis. However, program traces for just a few seconds of execution are enormous, up to several terabytes in size, uncompressed. Specialized compression can shrink traces to a few gigabytes, but trace analyzers typically stream the decompressed trace through the analysis engine. Thus, the complexity of analysis depends on the decompressed trace size (even though the decompressed trace is never stored to disk). This makes many global or interactive analyses infeasible. This paper presents a method to compress program traces using binary decision diagrams (BDDs). BDDs intrinsically support operations common to many desirable program analyses and these analyses operate directly on the BDD. Thus, they are often polynomial in the size of the compressed representation. The paper presents mechanisms to represent a variety of trace data using BDDs and shows that BDDs can store, in 1 GB of RAM, the entire data-dependence graph of traces with over 1 billion instructions. This allows rapid computation of global analyses such as heap-object liveness and dynamic slicing", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "binary decision diagrams; Binary decision diagrams; Boolean functions; Data analysis; Data structures; data-dependence graph; dynamic slicing; Engines; global analyses; Hardware; hardware analysis; heap-object liveness; instruction-level traces; Performance analysis; Polynomials; program analysis; program slicing; program traces; rapid computation; Read-write memory; Software Engineering; Software Processor validation Engineering; Software/Program Verification; Software/Software; Software/Software Engineering; specialized compression; Testing and Debugging; trace analyzers; traces compression; Tracing; Validation; Visualization", } @Article{MoretoPlanas:2007:EDC, author = "M. {Moreto Planas} and F. Cazorla and A. Ramirez and M. Valero", title = "Explaining Dynamic Cache Partitioning Speed Ups", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "1", pages = "1--4", month = jan, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cache partitioning has been proposed as an interesting alternative to traditional eviction policies of shared cache levels in modern CMP architectures: throughput is improved at the expense of a reasonable cost. However, these new policies present different behaviors depending on the applications that are running in the architecture. In this paper, we introduce some metrics that characterize applications and allow us to give a clear and simple model to explain final throughput speed ups.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design Styles; B.3.2.b Cache memories; B.3.3 Performance Analysis and Design Aids; C Computer Systems Organization; C.1 Processor Architectures; C.1.4 Parallel Architectures; C.1.4.e Multi-core/single-chip multiprocessors; C.1.5 Micro-architecture implementation considerations; C.1.5.e Memory hierarchy; C.4 Performance of Systems; C.4.d Modeling techniques; cache storage; chip multiprocessing; Computer architecture; Counting circuits; dynamic cache partitioning; microprocessor chips; Parallel processing; Process design; Resource management; shared cache levels; Streaming media; Surface-mount technology; Throughput; Uninterruptible power systems", } @Article{Jerger:2007:CSC, author = "N. Enright Jerger and M. Lipasti and L. Peh", title = "Circuit-Switched Coherence", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "1", pages = "5--8", month = jan, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Circuit-switched networks can significantly lower the communication latency between processor cores, when compared to packet-switched networks, since once circuits are set up, communication latency approaches pure interconnect delay. However, if circuits are not frequently reused, the long set up time and poorer interconnect utilization can hurt overall performance. To combat this problem, we propose a hybrid router design which intermingles packet-switched flits with circuit-switched flits. Additionally, we co-design a prediction-based coherence protocol that leverages the existence of circuits to optimize pair-wise sharing between cores. 
The protocol allows pair-wise sharers to communicate directly with each other via circuits and drives up circuit reuse. Circuit-switched coherence provides overall system performance improvements of up to 17\% with an average improvement of 10\% and reduces network latency by up to 30\%.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; C Computer Systems Organization; C.1 Processor Architectures; C.1.4 Parallel Architectures; C.1.4.e Multi-core/single-chip multiprocessors; C.1.4.g On-chip interconnection networks; C.1.5 Micro-architecture implementation considerations; C.1.5.e Memory hierarchy; circuit switching; circuit-switched network; Coupling circuits; Delay; Fabrics; hybrid router design; Integrated circuit interconnections; multiprocessor interconnection networks; network latency; Network-on-a-chip; packet switching; Packet switching; packet switching; pair-wise sharing; Pipelines; prediction-based coherence protocol; processor core; Protocols; routing protocols; System performance", } @Article{Kodakara:2007:CRM, author = "S. Kodakara and J. Kim and D. Lilja and D. Hawkins and W. Hsu and P. Yew", title = "{CIM}: a Reliable Metric for Evaluating Program Phase Classifications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "1", pages = "9--12", month = jan, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We propose the use of the confidence interval of estimated mean (CIM), a metric based on statistical sampling theory, to evaluate the quality of a given phase classification and for comparing different phase classification schemes. 
Previous research on phase classification used the weighted average of coefficient of variation (CoVwa) to estimate phase classification quality. We found that the phase quality indicated by CoVwa could be inconsistent across different phase classifications. We explain the reasons behind this inconsistency and demonstrate the inconsistency using data from several SPEC CPU2000 benchmark programs. We show that the confidence interval of estimated mean (CIM) correctly estimates the quality of phase classification with a meaningful statistical interpretation.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; Benchmark Analysis; Clustering algorithms; Computer architecture; computer architecture; Computer integrated manufacturing; confidence interval; estimated mean; estimation theory; pattern classification; Phase Classification; Phase detection; Phase estimation; Phase measurement; phase quality estimation; program compilers; program diagnostics; program phase classification; Quality Metric; reliable metric; Sampling methods; sampling methods; SPEC CPU2000 benchmark program; statistical interpretation; Statistical Sampling; statistical sampling theory; Statistics; Surges", } @Article{Dieter:2007:LCM, author = "W. R. Dieter and A. Kaveti and H. G. 
Dietz", title = "Low-Cost Microarchitectural Support for Improved Floating-Point Accuracy", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "1", pages = "13--16", month = jan, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Some processors designed for consumer applications, such as graphics processing units (CPUs) and the CELL processor, promise outstanding floating-point performance for scientific applications at commodity prices. However, IEEE single precision is the most precise floating-point data type these processors directly support in hardware. Pairs of native floating-point numbers can be used to represent a base result and a residual term to increase accuracy, but the resulting order of magnitude slowdown dramatically reduces the price/performance advantage of these systems. By adding a few simple microarchitectural features, acceptable accuracy can be obtained with relatively little performance penalty. To reduce the cost of native-pair arithmetic, a residual register is used to hold information that would normally have been discarded after each floating-point computation. The residual register dramatically simplifies the code, providing both lower latency and better instruction-level parallelism.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; B Hardware; B.2 Arithmetic and Logic Structures; B.2.4 High-Speed Arithmetic; B.2.4.b Cost/performance; C Computer Systems Organization; C.0 General; C.0.b Hardware/software interfaces; C.1 Processor Architectures; C.1.5 Micro-architecture implementation considerations; CELL processor; computer architecture; Costs; floating point arithmetic; floating-point accuracy; Floating-point arithmetic; G Mathematics of Computing; G.1 Numerical Analysis; G.1.0 General; G.1.0.e Multiple precision arithmetic; Graphics; graphics processing units; Hardware; I Computing Methodologies; I.3 Computer Graphics; I.3.1 Hardware Architecture; I.3.1.a Graphics processors; IEEE single precision; instruction-level parallelism; microarchitectural support; Microarchitecture; parallel processing; Pipelines; Registers; Software algorithms; Software performance", } @Article{Etsion:2007:PPT, author = "Y. Etsion and D. G. Feitelson", title = "Probabilistic Prediction of Temporal Locality", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "1", pages = "17--20", month = jan, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The increasing gap between processor and memory speeds, as well as the introduction of multi-core CPUs, have exacerbated the dependency of CPU performance on the memory subsystem. This trend motivates the search for more efficient caching mechanisms, enabling both faster service of frequently used blocks and decreased power consumption. In this paper we describe a novel, random sampling based predictor that can distinguish transient cache insertions from non-transient ones. 
We show that this predictor can identify a small set of data cache resident blocks that service most of the memory references, thus serving as a building block for new cache designs and block replacement policies. Although we only discuss the L1 data cache, we have found this predictor to be efficient also when handling L1 instruction caches and shared L2 caches.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design Styles; B.3.2.b Cache memories; B.3.3 Performance Analysis and Design Aids; cache storage; Computer science; Data analysis; data cache; Distributed computing; Energy consumption; Extraterrestrial phenomena; memory subsystem; multi-core CPU; power aware computing; probabilistic prediction; random sampling; Sampling methods; temporal locality; transient cache insertions; Visualization", } @Article{Guz:2007:NCO, author = "Z. Guz and I. Keidar and A. Kolodny and U. Weiser", title = "{Nahalal}: Cache Organization for Chip Multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "1", pages = "21--24", month = jan, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper addresses cache organization in chip multiprocessors (CMPs). We show that in CMP systems it is valuable to distinguish between shared data, which is accessed by multiple cores, and private data accessed by a single core. 
We introduce Nahalal, an architecture whose novel floorplan topology partitions cached data according to its usage (shared versus private data), and thus enables fast access to shared data for all processors while preserving the vicinity of private data to each processor. Nahalal exhibits significant improvements in cache access latency compared to a traditional cache design.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Cache memories; cache organization; cache storage; chip multiprocessors; circuit layout; CMP systems; Computer integrated manufacturing; Computer Systems Organization; Design Styles; floorplan topology partitions; Hardware; Memory Structures; microprocessor chips; Multi-core/single-chip multiprocessors; Nahalal; Parallel Architectures; Processor Architectures; Writing", } @Article{Joao:2007:DPI, author = "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt", title = "Dynamic Predication of Indirect Jumps", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "25--28", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Indirect jumps are used to implement increasingly-common programming language constructs such as virtual function calls, switch-case statements, jump tables, and interface calls. Unfortunately, the prediction accuracy of indirect jumps has remained low because many indirect jumps have multiple targets that are difficult to predict even with specialized hardware. This paper proposes a new way of handling hard-to-predict indirect jumps: dynamically predicating them. 
The compiler identifies indirect jumps that are suitable for predication along with their control-flow merge (CFM) points. The microarchitecture predicates the instructions between different targets of the jump and its CFM point if the jump turns out to be hard-to-predict at run time. We describe the new indirect jump predication architecture, provide code examples showing why it could reduce the performance impact of jumps, derive an analytical cost-benefit model for deciding which jumps and targets to predicate, and present preliminary evaluation results.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; Analytical models; and statically-scheduled implementation; Computer languages; Computer Systems Organization; control-flow merge point; dynamically-scheduled; dynamically-scheduled and statically-scheduled implementation; hard-to-predict indirect jump handling; Hardware; Instruction fetch; Instruction sets; interface call; jump table; Micro-architecture implementation considerations; Microarchitecture; microarchitecture dynamic predication; Object oriented modeling; parallel architectures; Performance analysis; Pipeline processors; Pipelines; Processor Architectures; program compiler; program compilers; program control structures; programming language construct; Single Data Stream Architectures; Superscalar; switch-case statement; Switches; system monitoring; virtual function call", } @Article{Das:2007:MMC, author = "A. Das and S. Ozdemir and G. Memik and J. Zambreno and A. 
Choudhary", title = "Microarchitectures for Managing Chip Revenues under Process Variations", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "29--32", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As transistor feature sizes continue to shrink into the sub-90 nm range and beyond, the effects of process variations on critical path delay and chip yields have amplified. A common concept to remedy the effects of variation is speed-binning, by which chips from a single batch are rated by a discrete range of frequencies and sold at different prices. In this paper, we discuss strategies to modify the number of chips in different bins and hence enhance the profits obtained from them. Particularly, we propose a scheme that introduces a small Substitute Cache associated with each cache way to replicate the data elements that will be stored in the high latency lines. Assuming a fixed pricing model, this method increases the revenue by as much as 13.8\% without any impact on the performance of the chips.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Cache Memories; cache memory; cache storage; Circuits; Computer Architecture; computer architecture; Computer Architecture; Computer architecture; critical path delay; Fabrication; Fault-tolerant Computing.; fixed pricing model; Frequency; Logic arrays; Microarchitecture; microarchitecture chip; microprocessor chips; Microprocessors; optimisation; process variation; Process Variations; Registers; Size control; Voltage control", } @Article{Zebchuk:2007:BBC, author = "J. Zebchuk and A. 
Moshovos", title = "A Building Block for Coarse-Grain Optimizations in the On-Chip Memory Hierarchy", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "33--36", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Current on-chip block-centric memory hierarchies exploit access patterns at the fine-grain scale of small blocks. Several recently proposed memory hierarchy enhancements for coherence traffic reduction and prefetching suggest that additional useful patterns emerge with a macroscopic, coarse-grain view. This paper presents RegionTracker, a dual-grain, on-chip cache design that exposes coarse-grain behavior while maintaining block-level communication. RegionTracker eliminates the extraneous, often imprecise coarse-grain tracking structures of previous proposals. It can be used as the building block for coarse-grain optimizations, reducing their overall cost and easing their adoption. Using full-system simulation of a quad-core chip multiprocessor and commercial workloads, we demonstrate that RegionTracker overcomes the inefficiencies of previous coarse-grain cache designs. We also demonstrate how RegionTracker boosts the benefits and reduces the cost of a previously proposed snoop reduction technique.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "access patterns; Bandwidth; cache storage; Cache storage; coarse-grain optimizations; coherence traffic reduction; Cost function; Design optimization; Explosions; Information management; Memory management; Multithreading; on-chip memory hierarchy; optimising compilers; Prefetching; prefetching; Proposals; quad-core chip multiprocessor; RegionTracker dual-grain on-chip cache design; system-on-chip", } @Article{Kim:2007:FBT, author = "J. Kim and J. Balfour and W. J. Dally", title = "Flattened Butterfly Topology for On-Chip Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "37--40", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "With the trend towards increasing number of cores in a multicore processors, the on-chip network that connects the cores needs to scale efficiently. In this work, we propose the use of high-radix networks in on-chip networks and describe how the flattened butterfly topology can be mapped to on-chip networks. By using high-radix routers to reduce the diameter of the network, the flattened butterfly offers lower latency and energy consumption than conventional on-chip topologies. In addition, by properly using bypass channels in the flattened butterfly network, non-minimal routing can be employed without increasing latency or the energy consumption.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Computer networks; Delay; Energy consumption; flattened butterfly; flattened butterfly topology; high-radix networks; high-radix routers; Laboratories; Multicore processing; multicore processors; Multiprocessor interconnection networks; Network topology; network topology; Network-on-a-chip; network-on-chip; on-chip networks; Routing; topology", } @Article{Xiao:2007:NPD, author = "X. Xiao and J. Lee", title = "A Novel Parallel Deadlock Detection Algorithm and Hardware for Multiprocessor System-on-a-Chip", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "41--44", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Given the projected dramatic increase in the number of processors and resources in a system-on-a-chip, a quadratic increase in the likelihood of deadlock is predicted due to complex system behavior. To deal with this issue, we here present a novel parallel hardware-oriented deadlock detection algorithm with $ O(1) $ DEADLOCK DETECTION AND $ O(\MIN (M, N)) $ preparation, where $m$ and $n$ are the numbers of processes and resources, respectively. Our contributions are (i) the first $ O(1)$ deadlock detection hardware implementation and (ii) a new algorithmic method of achieving $ O(\min (m, n))$ overall run-time complexity. We implement our algorithm in Verilog HDL and demonstrate that deadlock detection always takes only two clock cycles regardless of the size of a system (i.e., $m$ and $n$).", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithms implemented in hardware; computational complexity; deadlock detection hardware; Deadlocks; Detection algorithms; Hardware design languages; microprocessor chips; Multiprocessing systems; multiprocessing systems; multiprocessor system-on-a-chip; operating systems (computers); Parallel algorithms; parallel algorithms; parallel deadlock detection algorithm; Processor scheduling; Real time systems; Real-time and embedded systems; Resource management; run-time complexity; Runtime; Software performance; System recovery; system-on-chip", } @Article{August:2007:UOS, author = "D. August and J. Chang and S. Girbal and D. Gracia-Perez and G. Mouchard and D. A. Penry and O. Temam and N. Vachharajani", title = "{UNISIM}: an Open Simulation Environment and Library for Complex Architecture Design and Collaborative Development", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "45--48", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Simulator development is already a huge burden for many academic and industry research groups; future complex or heterogeneous multi-cores, as well as the multiplicity of performance metrics and required functionality, will make matters worse. We present a new simulation environment, called UNISIM, which is designed to rationalize simulator development by making it possible and efficient to distribute the overall effort over multiple research groups, even without direct cooperation. 
UNISIM achieves this goal with a combination of modular software development, distributed communication protocols, multilevel abstract modeling, interoperability capabilities, a set of simulator services APIs, and an open library/repository for providing a consistent set of simulator modules.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "API; application program interfaces; Collaboration; collaborative development; complex architecture design; Computational modeling; Computer architecture; Computer industry; Computer science; Design engineering; distributed communication protocols; groupware; interoperability capability; Libraries; Measurement; modular software development; multilevel abstract modeling; open library; open repository; open simulation environment; open systems; Operating systems; Performance and Reliability; Processor Architectures; Programming; simulator development; simulator modules; simulator services; software architecture; UNISIM", } @Article{Sendag:2007:BMP, author = "R. Sendag and J. Yi and P. Chuang", title = "Branch Misprediction Prediction: Complementary Branch Predictors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "49--52", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper, we propose a new class of branch predictors, complementary branch predictors, which can be easily added to any branch predictor to improve the overall prediction accuracy. This mechanism differs from conventional branch predictors in that it focuses only on mispredicted branches. 
As a result, this mechanism has the advantages of scalability and flexibility (can be implemented with any branch predictor), but is not on the critical path. More specifically, this mechanism improves the branch prediction accuracy by predicting which future branch will be mispredicted next and when that will occur, and then it changes the predicted direction at the predicted time. Our results show that a branch predictor with the branch misprediction predictor achieves the same prediction accuracy as a conventional branch predictor that is 4 to 16 times larger, but without significantly increasing the overall complexity or lengthening the critical path.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; branch misprediction prediction; branch predictor; computational complexity; Computer networks; Costs; Delay; Emerging technologies; History; parallel architectures; Performance loss; Pipeline processors; Pipelines; Prediction algorithms; Scalability; Testing", } @Article{Yalcin:2007:UTM, author = "G. Yalcin and O. Ergin", title = "Using tag-match comparators for detecting soft errors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "53--56", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.14", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Soft errors caused by high energy particle strikes are becoming an increasingly important problem in microprocessor design. With increasing transistor density and die sizes, soft errors are expected to be a larger problem in the near future. 
Recovering from these unexpected faults may be possible by reexecuting some part of the program only if the error can be detected. Therefore it is important to come up with new techniques to detect soft errors and increase the number of errors that are detected. Modern microprocessors employ out-of-order execution and dynamic scheduling logic. Comparator circuits, which are used to keep track of data dependencies, are usually idle. In this paper, we propose various schemes to exploit on-chip comparators to detect transient faults. Our results show that around 50\% of the errors on the wakeup logic can be detected with minimal hardware overhead by using the proposed techniques.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "and Fault-Tolerance; Broadcasting; Circuit faults; comparators (circuits); Computer errors; Control Structure Reliability; dynamic scheduling logic; Electrical fault detection; Fault detection; identification technology; Logic; logic design; logic testing; microprocessor chips; microprocessor design; Microprocessors; Out of order; out-of-order execution; Pipelines; Processor Architectures; Registers; scheduling; soft error detection; tag-match comparator; Testing; Testing and Fault-Tolerance", } @Article{Joao:2008:DPI, author = "J. A. Joao and O. Mutlu and H. Kim and Y. N. 
Patt", title = "Dynamic Predication of Indirect Jumps", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "1--4", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Indirect jumps are used to implement increasingly common programming language constructs such as virtual function calls, switch-case statements, jump tables, and interface calls. Unfortunately, the prediction accuracy of indirect jumps has remained low because many indirect jumps have multiple targets that are difficult to predict even with specialized hardware. This paper proposes a new way of handling hard-to-predict indirect jumps: dynamically predicating them. The compiler identifies indirect jumps that are suitable for predication along with their control-flow merge (CFM) points. The microarchitecture predicates the instructions between different targets of the jump and its CFM point if the jump turns out to be hardto-predict at run time. We describe the new indirect jump predication architecture, provide code examples showing why it could reduce the performance impact of jumps, derive an analytical cost-benefit model for deciding which jumps and targets to predicate, and present preliminary evaluation results.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; Analytical models; B Hardware; B.3 Memory Structures; Cache memories; Computer languages; Computer Systems Organization; Design Styles; Hardware; Instruction sets; Microarchitecture; Multi-core/single-chip multiprocessors; Object oriented modeling; Parallel Architectures; Performance analysis; Pipelines; Processor Architectures; Switches", } @Article{Das:2008:MMC, author = "A. Das and S. Ozdemir and G. Memik and J. Zambreno and A. Choudhary", title = "Microarchitectures for Managing Chip Revenues under Process Variations", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "5--8", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As transistor feature sizes continue to shrink into the sub-90nm range and beyond, the effects of process variations on critical path delay and chip yields have amplified. A common concept to remedy the effects of variation is speed-binning, by which chips from a single batch are rated by a discrete range of frequencies and sold at different prices. In this paper, we discuss strategies to modify the number of chips in different bins and hence enhance the profits obtained from them. Particularly, we propose a scheme that introduces a small substitute cache associated with each cache way to replicate the data elements that will be stored in the high latency lines. Assuming a fixed pricing model, this method increases the revenue by as much as 13.8\% without any impact on the performance of the chips.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Cache Memories; Computer Architecture; Computer architecture; Cost function; Delay effects; Design optimization; Fabrication; Fault-tolerant Computing.; Frequency; Manufacturing; Microarchitecture; Pricing; Process Variations; Transistors", } @Article{Roth:2008:PRR, author = "A. Roth", title = "Physical register reference counting", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "9--12", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Several proposed techniques including CPR (checkpoint processing and recovery) and NoSQ (no store queue) rely on reference counting to manage physical registers. However, the register reference counting mechanism itself has received surprisingly little attention. This paper fills this gap by describing potential register reference counting schemes for NoSQ, CPR, and a hypothetical NoSQ/CPR hybrid. Although previously described in terms of binary counters, we find that reference counts are actually more naturally represented as matrices. Binary representations can be used as an optimization in specific situations.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "and statically-scheduled implementation; binary representations; checkpoint processing; checkpointing; Counting circuits; dynamically-scheduled; dynamically-scheduled and statically-scheduled implementation; Engines; Information science; matrices; Micro-architecture implementation considerations; Microarchitecture; no store queue; physical register reference counting; Physics computing; Proposals; recovery technique; Registers; shift registers; Superscalar", } @Article{Flich:2008:LBD, author = "J. Flich and J. Duato", title = "Logic-Based Distributed Routing for {NoCs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "13--16", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.16", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "The design of scalable and reliable interconnection networks for multicore chips (NoCs) introduces new design constraints like power consumption, area, and ultra low latencies. Although 2D meshes are usually proposed for NoCs, heterogeneous cores, manufacturing defects, hard failures, and chip virtualization may lead to irregular topologies. In this context, efficient routing becomes a challenge. Although switches can be easily configured to support most routing algorithms and topologies by using routing tables, this solution does not scale in terms of latency and area. We propose a new circuit that removes the need for using routing tables. 
The new mechanism, referred to as logic-based distributed routing (LBDR), enables the implementation in NoCs of many routing algorithms for most of the practical topologies we might find in the near future in a multicore chip. From an initial topology and routing algorithm, a set of three bits per switch output port is computed. By using a small logic block, LBDR mimics (demonstrated by evaluation) the behavior of routing algorithms implemented with routing tables. This result is achieved both in regular and irregular topologies. Therefore, LBDR removes the need for using routing tables for distributed routing, thus enabling flexible, fast and power-efficient routing in NoCs.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "chip virtualization; circuit reliability; Circuit topology; Delay; Energy consumption; heterogeneous cores; interconnection network reliability; interconnections; logic circuits; logic-based distributed routing; Manufacturing; manufacturing defects; Multi-core/single-chip multiprocessors; Multicore processing; Multiprocessor interconnection networks; network routing; network topology; Network topology; Network-on-a-chip; network-on-chip; networks for multicore chips; NoC; On-chip interconnection networks; Routing; Switches", } @Article{Yoon:2008:CHP, author = "J. H. Yoon and E. H. Nam and Y. J. Seong and H. Kim and B. Kim and S. L. Min and Y. 
Cho", title = "{Chameleon}: a High Performance Flash\slash {FRAM} Hybrid Solid State Disk Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "17--20", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.17", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Flash memory solid state disk (SSD) is gaining popularity and replacing hard disk drive (HDD) in mobile computing systems such as ultra mobile PCs (UMPCs) and notebook PCs because of lower power consumption, faster random access, and higher shock resistance. One of the key challenges in designing a high-performance flash memory SSD is an efficient handling of small random writes to non-volatile data whose performance suffers from the inherent limitation of flash memory that prohibits in-place update. In this paper, we propose a high performance Flash/FRAM hybrid SSD architecture called Chameleon. In Chameleon, metadata used by the flash translation layer (FTL), a software layer in the flash memory SSD, is maintained in a small FRAM since this metadata is a target of intensive small random writes, whereas the bulk data is kept in the flash memory. Performance evaluation based on an FPGA implementation of the Chameleon architecture shows that the use of FRAM in Chameleon improves the performance by 21.3\%. The results also show that even for bulk data that cannot be maintained in FRAM because of the size limitation, the use of fine-grained write buffering is critically important because of the inability of flash memory to perform in-place update of data.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Chameleon; Computer architecture; Design studies; disc drives; Energy consumption; Ferroelectric films; field programmable gate arrays; flash memories; Flash memory; flash memory solid state disk; flash translation layer; flash-FRAM hybrid SSD architecture; FPGA implementation; FTL; hard discs; hard disk drive; Hard disks; HDD; Mass storage; memory architecture; Mobile computing; mobile computing systems; Nonvolatile memory; notebook PCs; Personal communication networks; Random access memory; random-access storage; Solid state circuits; SSD; ultra mobile PCs; UMPC", } @Article{Biswas:2008:CAA, author = "A. Biswas and P. Racunas and J. Emer and S. Mukherjee", title = "Computing Accurate {AVFs} using {ACE} Analysis on Performance Models: a Rebuttal", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "21--24", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.19", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "ACE (architecturally correct execution) analysis computes AVFs (architectural vulnerability factors) of hardware structures. AVF expresses the fraction of radiation-induced transient faults that result in user-visible errors. Architects usually perform this analysis on a high-level performance model to quickly compute per-structure AVFs. If, however, low-level details of a microarchitecture are not modeled appropriately, then their effects may not be reflected in the per-structure AVFs. In this paper we refute Wang, et al.'s (2007) claim that this detail is difficult to model and imposes a practical threshold on ACE analysis that forces its estimates to have a high error margin. 
We show that carefully choosing a small amount of additional detail can result in a much tighter AVF bound than Wang, et al. were able to achieve in their refined ACE analysis. Even the inclusion of small details, such as read/write pointers and appropriate inter-structure dependencies, can increase the accuracy of the AVF computation by 40\% or more. We argue that this is no different than modeling the IPC (instructions per cycle) of a microprocessor pipeline. A less detailed performance model will provide less accurate IPCs. AVFs are no different.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "and Fault-Tolerance; architectural vulnerability factors; architecturally correct execution analysis; Computational modeling; Hardware; hardware structures; High performance computing; instructions per cycle; inter-structure dependencies; Microarchitecture; microprocessor pipeline; Microprocessors; Performance analysis; Performance and Reliability; performance evaluation; performance models; Pipelines; Protection; radiation-induced transient faults; read pointers; Reliability; Target tracking; Testing; Testing and Fault-Tolerance; user-visible errors; write pointers", } @Article{Cho:2008:CAL, author = "S. Cho and R. Melhem", title = "Corollaries to {Amdahl's Law} for Energy", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "25--28", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.18", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper studies the important interaction between parallelization and energy consumption in a parallelizable application. 
Given the ratio of serial and parallel portion in an application and the number of processors, we first derive the optimal frequencies allocated to the serial and parallel regions in the application to minimize the total energy consumption, while the execution time is preserved (i.e., speedup = 1). We show that dynamic energy improvement due to parallelization has a function rising faster with the increasing number of processors than the speed improvement function given by the well-known Amdahl's Law. Furthermore, we determine the conditions under which one can obtain both energy and speed improvement, as well as the amount of improvement. The formulas we obtain capture the fundamental relationship between parallelization, speedup, and energy consumption and can be directly utilized in energy aware processor resource management. Our results form a basis for several interesting research directions in the area of power and energy aware parallel processing.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Amdahl's Law; Application software; Computer science; Concurrent computing; dynamic energy improvement; energy aware processor resource management; Energy capture; energy consumption; Energy consumption; energy consumption; Energy management; Equations; Hardware; Parallel Architectures; parallel processing; Parallel processing; parallelization; Power Management; Radio spectrum management; Resource management", } @Article{Balfour:2008:EEP, author = "J. Balfour and W. Dally and D. Black-Schaffer and V. Parikh and J. 
Park", title = "An Energy-Efficient Processor Architecture for Embedded Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "1", pages = "29--32", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We present an efficient programmable architecture for compute-intensive embedded applications. The processor architecture uses instruction registers to reduce the cost of delivering instructions, and a hierarchical and distributed data register organization to deliver data. Instruction registers capture instruction reuse and locality in inexpensive storage structures that are located near to the functional units. The data register organization captures reuse and locality in different levels of the hierarchy to reduce the cost of delivering data. Exposed communication resources eliminate pipeline registers and control logic, and allow the compiler to schedule efficient instruction and data movement. The architecture keeps a significant fraction of instruction and data bandwidth local to the functional units, which reduces the cost of supplying instructions and data to large numbers of functional units. This architecture achieves an energy efficiency that is 23x greater than an embedded RISC processor.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Communication system control; compute-intensive embedded applications; Computer applications; computer architecture; Computer architecture; Costs; data movement; distributed data register organization; Embedded computing; embedded RISC processor; Embedded system; embedded systems; Energy efficiency; energy-efficient processor architecture; hierarchical organization; inexpensive storage structures; instruction registers; instruction sets; Logic; Mobile processors; pipeline processing; pipeline registers; Pipelines; Registers", } @Article{Anonymous:2008:FC, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "c1--c1", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Presents the front cover for this issue of the publication.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2008:EBC, author = "Anonymous", title = "Editorial Board [Cover2]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "c2--c2", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.16", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Provides a listing of current society officers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Pao:2008:PAM, author = "D. Pao and W. Lin and B. Liu", title = "Pipelined Architecture for Multi-String Matching", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "33--36", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This letter presents a new oblivious routing algorithm for 3D mesh networks called randomized partially-minimal (RPM) routing that provably achieves optimal worst- case throughput for 3D meshes when the network radix fc is even and within a factor of 1/k2 of optimal when k is odd. Although this optimality result has been achieved with the minimal routing algorithm OITURN for the 2D case, the worst-case throughput of OITURN degrades tremendously in higher dimensions. Other existing routing algorithms suffer from either poor worst-case throughput (DOR, ROMM) or poor latency (VAL). RPM on the other hand achieves near optimal worst-case and good average-case throughput as well as good latency performance.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3D mesh networks; Automata; computer architecture; Computer architecture; Computer science; Costs; deterministic finite automaton; Hardware; Intrusion detection; network intrusion detection; network radix; OITURN; Partial response channels; pipelined processing; Pipelines; randomized partially-minimal routing; string matching; Table lookup; three-dimensional mesh networks; Throughput", } @Article{Ramanujam:2008:RPM, author = "R. 
Sunkam Ramanujam and B. Lin", title = "Randomized Partially-Minimal Routing on Three-Dimensional Mesh Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "37--40", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This letter presents a new oblivious routing algorithm for 3D mesh networks called Randomized Partially- Minimal (RPM) routing that provably achieves optimal worst-case throughput for 3D meshes when the network radix k is even and within a factor of 1/k2 of optimal when k is odd. Although this optimality result has been achieved with the minimal routing algorithm O1TURN [9] for the 2D case, the worst-case throughput of O1TURN degrades tremendously in higher dimensions. Other existing routing algorithms suffer from either poor worst-case throughput (DOR [10], ROMM [8]) or poor latency (VAL [14]). RPM on the other hand achieves near optimal worst-case and good average-case throughput as well as good latency performance.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Degradation; Delay; Emerging technologies; Fabrics; Interconnection architectures; Mesh networks; Network communications; Network topology; On-chip interconnection networks; Packet-switching networks; Routing; Silicon; Technological innovation; Telecommunication traffic; Throughput", } @Article{Black-Schaffer:2008:HIR, author = "D. Black-Schaffer and J. Balfour and W. Dally and V. Parikh and J. 
Park", title = "Hierarchical Instruction Register Organization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "41--44", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper analyzes a range of architectures for efficient delivery of VLIW instructions for embedded media kernels. The analysis takes an efficient filter cache as a baseline and examines the benefits from (1) removing the tag overhead, (2) distributing the storage, (3) adding indirection, (4) adding efficient NOP generation, and (5) sharing instruction memory. The result is a hierarchical instruction register organization that provides a 56\% energy and 40\% area savings over an already efficient filter cache.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; Cache storage; Computer aided instruction; Computer architecture; Computer integrated manufacturing; distributed shared memory systems; Embedded computing; embedded media kernel; embedded processor architecture; embedded systems; filter cache; Filters; hierarchical instruction register organization; Instruction fetch; instruction memory sharing; instruction sets; Kernel; Laboratories; Low-power design; NOP generation; parallel architectures; Registers; RISC/CISC; VLIW; VLIW architectures; VLIW instruction delivery", } @Article{Lee:2008:PDD, author = "J. Lee and X. 
Xiao", title = "A Parallel Deadlock Detection Algorithm with {$ O(1) $} Overall Run-time Complexity", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "45--48", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This article proposes a novel parallel, hardware-oriented deadlock detection algorithm for multiprocessor system-on-chips. The proposed algorithm takes full advantage of hardware parallelism in computation and maintains information needed by deadlock detection through classifying all resource allocation events and performing class specific operations, which together make the overall run-time complexity of the new method O(1). We implement the proposed algorithm in Verilog HDL and demonstrate in the simulation that each algorithm invocation takes at most four clock cycles in hardware.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithms implemented in hardware; clock cycle; Computational modeling; Concurrent computing; Deadlocks; Detection algorithms; Event detection; hardware description languages; Hardware design languages; hardware-oriented deadlock detection; Multiprocessing systems; multiprocessing systems; multiprocessor system-on-chips; operating systems (computers); parallel deadlock detection; Parallel processing; Real-time and embedded systems; resource allocation; Resource management; run-time complexity; Runtime; System recovery; system-on-chip; Verilog HDL", } @Article{GomezRequena:2008:BFT, author = "C. {Gomez Requena} and F. Gilabert Villamon and M. Gomez and P. Lopez and J. 
Duato", title = "Beyond Fat-tree: Unidirectional Load-Balanced Multistage Interconnection Network", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "49--52", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", note = "See comment \cite{Antelo:2009:CBF}.", abstract = "The fat-tree is one of the most widely-used topologies by interconnection network manufacturers. Recently, it has been demonstrated that a deterministic routing algorithm that optimally balances the network traffic can not only achieve almost the same performance than an adaptive routing algorithm but also outperforms it. On the other hand, fat-trees require a high number of switches with a non-negligible wiring complexity. In this paper, we propose replacing the fat-tree by a unidirectional multistage interconnection network (UMIN) that uses a traffic balancing deterministic routing algorithm. As a consequence, switch hardware is almost reduced to the half, decreasing, in this way, the power consumption, the arbitration complexity, the switch size itself, and the network cost. Preliminary evaluation results show that the UMIN with the load balancing scheme obtains lower latency than fat-tree for low and medium traffic loads. Furthermore, in networks with a high number of stages or with high radix switches, it obtains the same, or even higher, throughput than fat-tree.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "adaptive routing algorithm; Butterfly Network; computational complexity; Cost-efficiency; Costs; Deterministic Routing; Energy consumption; Fat-trees; Hardware; interconnection network manufacturers; Manufacturing; Multiprocessor interconnection networks; Multistage Interconnection Networks; Network Architecture and Design; Network topology; network traffic; nonnegligible wiring complexity; power consumption; radix switches; Routing; Switches; telecommunication network routing; telecommunication switching; Telecommunication traffic; telecommunication traffic; Traffic Balancing; traffic balancing deterministic routing algorithm; trees (mathematics); unidirectional load-balanced multistage interconnection network; Wiring", } @Article{Li:2008:TAN, author = "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun", title = "Transaction-Aware Network-on-Chip Resource Reservation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "53--56", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Performance and scalability are critically-important for on-chip interconnect in many-core chip-multiprocessor systems. Packet-switched interconnect fabric, widely viewed as the de facto on-chip data communication backplane in the many-core era, offers high throughput and excellent scalability. However, these benefits come at the price of router latency due to run-time multi-hop data buffering and resource arbitration. The network accounts for a majority of on-chip data transaction latency. 
In this work, we propose dynamic in-network resource reservation techniques to optimize run-time on-chip data transactions. This idea is motivated by the need to preserve existing abstraction and general-purpose network performance while optimizing for frequently-occurring network events such as data transactions. Experimental studies using multithreaded benchmarks demonstrate that the proposed techniques can reduce on-chip data access latency by 28.4\% on average in a 16-node system and 29.2\% on average in a 36-node system.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Backplanes; buffer storage; Computer buffers; data communication; Data communication; de facto on-chip data communication backplane; Delay; dynamic in-network resource reservation techniques; Fabrics; frequently-occurring network events; Interconnection architectures; Interconnections (Subsystems); many-core chip-multiprocessor systems; multiprocessor interconnection networks; Network-on-a-chip; on-chip data transaction latency; On-chip interconnection networks; packet switching; packet-switched interconnect fabric; Parallel Architectures; resource allocation; router latency; run-time multihop data buffering; Runtime; Scalability; System-on-a-chip; telecommunication network routing; Throughput; transaction-aware network-on-chip resource reservation", } @Article{Fide:2008:PUS, author = "S. Fide and S. 
Jenks", title = "Proactive Use of Shared {L3} Caches to Enhance Cache Communications in Multi-Core Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "57--60", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The software and hardware techniques to exploit the potential of multi-core processors are falling behind, even though the number of cores and cache levels per chip is increasing rapidly. There is no explicit communications support available, and hence inter-core communications depend on cache coherence protocols, resulting in demand-based cache line transfers with their inherent latency and overhead. In this paper, we present software controlled eviction (SCE) to improve the performance of multithreaded applications running on multi-core processors by moving shared data to shared cache levels before it is demanded from remote private caches. Simulation results show that SCE offers significant performance improvement (8-28\%) and reduces L3 cache misses by 88-98\%.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache coherence protocol; cache communication; cache storage; Concurrent computing; Control systems; Degradation; Delay; demand-based cache line transfer; Hardware; intercore communications; microprocessor chips; Multi-core/single-chip multiprocessors; multi-threading; Multicore processing; multicore processors; multithreaded application; Parallel processing; Protocols; shared L3 cache; shared memory systems; software controlled eviction; Software performance; Support for multi-threaded execution", } @Article{Walter:2008:BBE, author = "I. Walter and I. Cidon and A. Kolodny", title = "{BENoC}: a Bus-Enhanced Network on-Chip for a Power Efficient {CMP}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "61--64", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Network-on-chips (NoCs) outperform buses in terms of scalability, parallelism and system modularity and therefore are considered as the main interconnect infrastructure in future chip multi-processor (CMP). However, while NoCs are very efficient for delivering high throughput point-to-point data from sources to destinations, their multi-hop operation is too slow for latency sensitive signals. In addition, current NoCs are inefficient for broadcast operations and centralized control of CMP resources. Consequently, state-of-the-art NoCs may not facilitate the needs of future CMP systems. In this paper, the benefit of adding a low latency, customized shared bus as an internal part of the NoC architecture is explored. 
BENoC (bus-enhanced network on-chip) possesses two main advantages: First, the bus is inherently capable of performing broadcast transmission in an efficient manner. Second, the bus has lower and more predictable propagation latency. In order to demonstrate the potential benefit of the proposed architecture, an analytical comparison of the power saving in BENoC versus a standard NoC providing similar services is presented. Then, simulation is used to evaluate BENoC in a dynamic non-uniform cache access (DNUCA) multiprocessor system.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "broadcast transmission; Broadcasting; bus-enhanced network-on-chip; Centralized control; chip multiprocessor; Delay; dynamic nonuniform cache access; integrated circuit interconnections; interconnect infrastructure; Interconnection architectures; low-power electronics; microprocessor chips; multiprocessing systems; Multiprocessing systems; Multiprocessor interconnection networks; Network-on-a-chip; network-on-chip; NoC; On-chip interconnection networks; power efficient CMP; Power system interconnection; propagation latency; Scalability; system buses; System-on-a-chip; Throughput", } @Article{Golander:2008:DDS, author = "A. Golander and S. Weiss and R. Ronen", title = "{DDMR}: Dynamic and Scalable Dual Modular Redundancy with Short Validation Intervals", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "65--68", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "DMR (dual modular redundancy) was suggested for increasing reliability. 
Classical DMR consists of pairs of cores that check each other and are pre-connected during manufacturing by dedicated links. In this paper we introduce the dynamic dual modular redundancy (DDMR) architecture. DDMR supports run-time scheduling of redundant threads, which has significant benefits relative to static binding. To allow dynamic pairing, DDMR replaces the special links with a novel ring architecture. DDMR uses short instruction sequences for validation, smaller than the processor reorder buffer. Such short sequences reduce latencies in parallel programs and save resources needed to buffer uncommitted data. DDMR scales with the number of cores and may be used in large multicore architectures.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "buffer storage; DDMR; Delay; dynamic dual modular redundancy; Job shop scheduling; Joining processes; Manufacturing; Multi-core/single-chip multiprocessors; multicore architectures; Multicore processing; parallel architectures; parallel programs; processor reorder buffer; processor scheduling; Processor scheduling; Proposals; Redundancy; Redundant design; ring architecture; run-time scheduling; scalable dual modular redundancy; short validation intervals; Transistors", } @Article{Anonymous:2008:IA, author = "Anonymous", title = "Information for authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "c3--c3", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.17", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Provides instructions and guidelines to prospective authors who wish to submit manuscripts.", acknowledgement = ack-nhfb, ajournal = "IEEE 
Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2008:ICS, author = "Anonymous", title = "{IEEE Computer Society} [Cover 4]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "c4--c4", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.18", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Provides a listing of current society officers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Ramanujam:2009:WRR, author = "Rohit Sunkam Ramanujam and Bill Lin", title = "Weighted Random Routing on Torus Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.14", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper, we introduce a new closed-form oblivious routing algorithm called W2TURN that is worst-case throughput optimal for 2D-torus networks. W2TURN is based on a weighted random selection of paths that contain at most two turns. In terms of average hop count, W2TURN outperforms the best previously known closed-form worst-case throughput optimal routing algorithm called IVAL [7]. 
In addition, we present a new optimal weighted random routing algorithm for rings called WRD.", acknowledgement = ack-nhfb, affiliation = "Ramanujam, RS (Reprint Author), Univ Calif San Diego, San Diego, CA 92103 USA. Ramanujam, Rohit Sunkam; Lin, Bill, Univ Calif San Diego, San Diego, CA 92103 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "rsunkamr@ucsd.edu billlin@ucsd.edu", da = "2019-06-20", doc-delivery-number = "V17GC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "2D-torus networks; Algorithm design and analysis; closed-form oblivious routing algorithm; Data communications; Delay; Interconnection network; internetworking; IVAL; latency; Measurement; Multiprocessor interconnection networks; Network-on-a-chip; oblivious routing; Oblivious Routing; On-chip interconnection networks; optimal weighted random routing algorithm; Routing; Runtime; System recovery; telecommunication network routing; throughput; Throughput; torus network; Torus Network; W2TURN; weighted random path selection", number-of-cited-references = "8", ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X", research-areas = "Computer Science", researcherid-numbers = "Lin, Binshan/A-9772-2009", times-cited = "2", unique-id = "Ramanujam:2009:WRR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Ahn:2009:MDE, author = "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber and Norman P. 
Jouppi", title = "Multicore {DIMM}: an Energy Efficient Memory Module with Independently Controlled {DRAMs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Demand for memory capacity and bandwidth keeps increasing rapidly in modern computer systems, and memory power consumption is becoming a considerable portion of the system power budget. However, the current DDR DIMM standard is not well suited to effectively serve CMP memory requests from both a power and performance perspective. We propose a new memory module called a Multicore DIMM, where DRAM chips are grouped into multiple virtual memory devices, each of which has its own data path and receives separate commands (address and control signals). The Multicore DIMM is designed to improve the energy efficiency of memory systems with small impact on system performance. Dividing each memory module into 4 virtual memory devices brings a simultaneous 22\%, 7.6\%, and 18\% improvement in memory power, IPC, and system energy-delay product respectively on a set of multithreaded applications and consolidated workloads.", acknowledgement = ack-nhfb, affiliation = "Ahn, JH (Reprint Author), Hewlett Packard Labs, Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber, Robert S.; Jouppi, Norman P., Hewlett Packard Labs, Mississauga, ON, Canada. Leverich, Jacob, Stanford Univ, Stanford, CA 94305 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; CMP memory requests; Control systems; DDR DIMM standard; DRAM; DRAM chips; Energy consumption; Energy efficiency; energy efficiency; energy efficient memory module; Energy-aware systems; Error correction codes; independently controlled DRAM; Jacobian matrices; memory capacity; memory module; memory power consumption; Memory Structures; memory system; microprocessor chips; Multicore; multicore DIMM; Multicore processing; Proposals; Random access memory; System performance; system power budget; virtual memory devices", number-of-cited-references = "16", ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394", research-areas = "Computer Science", researcherid-numbers = "Ahn, Jung Ho/D-1298-2013", times-cited = "26", unique-id = "Ahn:2009:MDE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2009:PST, author = "Po-Han Wang and Yen-Ming Chen and Chia-Lin Yang and Yu-Jung Cheng", title = "A Predictive Shutdown Technique for {GPU} Shader Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As technology continues to shrink, reducing leakage is critical to achieve energy efficiency. Previous works on low-power GPU (Graphics Processing Unit) focus on techniques for dynamic power reduction, such as DVFS (Dynamic Voltage/Frequency Scaling) and clock gating. In this paper, we explore the potential of adopting architecture-level power gating techniques for leakage reduction on GPU. In particular, we focus on the most power-hungry components, shader processors. 
We observe that, due to different scene complexity, the required shader resources to satisfy the target frame rate actually vary across frames. Therefore, we propose the Predictive Shader Shutdown technique to exploit workload variation across frames for leakage reduction on shader processors. The experimental results show that Predictive Shader Shutdown achieves up to 46\% leakage reduction on shader processors with negligible performance degradation.", acknowledgement = ack-nhfb, affiliation = "Wang, PH (Reprint Author), Natl Taiwan Univ, Dept Comp Sci \& Informat Engn, Taipei 10764, Taiwan. Wang, Po-Han; Chen, Yen-Ming; Yang, Chia-Lin, Natl Taiwan Univ, Dept Comp Sci \& Informat Engn, Taipei 10764, Taiwan. Cheng, Yu-Jung, Natl Taiwan Univ, Grad Inst Networking \& Multimedia, Taipei 10764, Taiwan.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "r96002@csie.ntu.edu.tw r95125@csie.ntu.edu.tw yangc@csie.ntu.edu.tw d96944002@ntu.edu.tw", da = "2019-06-20", doc-delivery-number = "V17GC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Institute for Information Industry of Taiwan [97-FS-C03]; National Taiwan University [97R0062-05]", funding-text = "This work was partially supported by the Institute for Information Industry of Taiwan under project No. 97-FS-C03, and by the Excellent Research Projects of National Taiwan University, 97R0062-05.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architecture-level power gating techniques; Central Processing Unit; Circuits; clock gating; Clocks; computer architecture; computer graphic equipment; Computer science; coprocessors; Degradation; dynamic power reduction; Dynamic voltage scaling; dynamic voltage-frequency scaling; Energy efficiency; Energy-aware systems; Frequency; GPU; GPU shader processors; Graphics; graphics processing unit; Layout; leakage; Low-power design; power aware computing; power gating; predictive shader shutdown technique", number-of-cited-references = "15", ORCID-numbers = "YANG, CHIA-LIN/0000-0003-0091-5027", research-areas = "Computer Science", times-cited = "10", unique-id = "Wang:2009:PST", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Barnes:2009:XBA, author = "Christopher Barnes and Pranav Vaidya and Jaehwan John Lee", title = "An {XML}-Based {ADL} Framework for Automatic Generation of Multithreaded Computer Architecture Simulators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Computer architecture simulation has always played a pivotal role in continuous innovation of computers. However, constructing or modifying a high quality simulator is time consuming and error-prone. Thus, often Architecture Description Languages (ADLs) are used to provide an abstraction layer for describing the computer architecture and automatically generating corresponding simulators. 
Along the line of such research, we present a novel XML-based ADL, its compiler, and a generation methodology to automatically generate multithreaded simulators for computer architecture. We utilize the industry-standard extensible markup language XML to describe the functionality and architecture of a modeled processor. Our ADL framework allows users to easily and quickly modify the structure, register set, and execution of a modeled processor. To prove its validity, we have generated several multithreaded simulators with different configurations based on the MIPS five-stage processor, and successfully tested with two programs.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GC", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "IUPUI RSFG", funding-text = "This research was funded by the IUPUI RSFG grant.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "abstraction layer; Architecture description languages; automatic generation; C.0.d Modeling of computer architecture; C.1.1.b Pipeline processors; Computational modeling; computer architecture; Computer architecture; Computer simulation; Concurrent computing; extensible markup language-architecture description language; Kernel; MIPS five-stage processor; Modeling of computer architecture; multi-threading; multithreaded computer architecture simulator; Object oriented modeling; Pipeline processors; Pipelines; program compilers; program verification; Testing; validity testing; XML; XML-based ADL framework", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "0", unique-id = "Barnes:2009:XBA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Luque:2009:CAC, author = "Carlos Luque and Miquel Moreto and Francisco J. 
Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu and Mateo Valero", title = "{CPU} Accounting in {CMP} Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Chip-MultiProcessors (CMP) introduce complexities when accounting CPU utilization to processes because the progress done by a process during an interval of time highly depends on the activity of the other processes it is co-scheduled with. We propose a new hardware accounting mechanism to improve the accuracy when measuring the CPU utilization in CMPs and compare it with the previous accounting mechanisms. Our results show that currently known mechanisms could lead to a 12\% average error when it comes to CPU utilization accounting. Our proposal reduces this error to less than 1\% in a modeled 4-core processor system.", acknowledgement = ack-nhfb, affiliation = "Luque, C (Reprint Author), Univ Politecn Cataluna, E-08028 Barcelona, Spain. Luque, Carlos; Moreto, Miquel; Valero, Mateo, Univ Politecn Cataluna, E-08028 Barcelona, Spain. Cazorla, Francisco J.; Valero, Mateo, Barcelona Supercomp Ctr, Barcelona, Spain.", ajournal = "IEEE Comput. Archit. 
Lett.", da = "2019-06-20", doc-delivery-number = "V17GC", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Ministry of Science and Technology of Spain [TIN-2007-60625, BES-2008-003683, AP-2005-3318]; HiPEAC Network of Excellence [IST-004408]; IBM Research; IBM Deep Computing organizations", funding-text = "This work has been supported by the Ministry of Science and Technology of Spain under contract TIN-2007-60625 and grants BES-2008-003683 and AP-2005-3318, by the HiPEAC Network of Excellence (IST-004408) and a Collaboration Agreement between IBM and BSC with funds from IBM Research and IBM Deep Computing organizations. The authors would like to thank Pradip Bose and Chen-Yong Cher from IBM for their technical support.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "4-core processor system; Bandwidth; Cache memory; chip-multiprocessor architecture; Clocks; CMP processor system; CPU utilization accounting; data center; General; Hardware; hardware accounting mechanism; Hardware/software interfaces; Kernel; microprocessor chips; Multi-core/single-chip multiprocessors; multiprocessing systems; operating system task scheduling; Operating systems; process scheduling; processor scheduling; Proposals; resource allocation; Semiconductor device measurement; Switches", number-of-cited-references = "11", oa = "Green Published", ORCID-numbers = "Moreto Planas, Miquel/0000-0002-9848-8758 Cazorla, Francisco/0000-0002-3344-376X Luque, Carlos/0000-0003-0442-0785 Valero, Mateo/0000-0003-2917-2482 Gioiosa, Roberto/0000-0001-9430-2656", research-areas = "Computer Science", researcherid-numbers = "Moreto Planas, Miquel/C-1823-2016 Cazorla, Francisco/D-7261-2016 Luque, Carlos/E-2110-2019 Valero, Mateo/L-5709-2014", times-cited = "5", unique-id = "Luque:2009:CAC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Soteriou:2009:HTD, author 
= "Vassos Soteriou and Rohit Sunkam Ramanujam and Bill Lin and Li-Shiuan Peh", title = "A High-Throughput Distributed Shared-Buffer {NoC} Router", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Microarchitectural configurations of buffers in routers have a significant impact on the overall performance of an on-chip network (NoC). This buffering can be at the inputs or the outputs of a router, corresponding to an input-buffered router (IBR) or an output-buffered router (OBR). OBRs are attractive because they have higher throughput and lower queuing delays under high loads than IBRs. However, a direct implementation of OBRs requires a router speedup equal to the number of ports, making such a design prohibitive given the aggressive clocking and power budgets of most NoC applications. In this letter, we propose a new router design that aims to emulate an OBR practically based on a distributed shared-buffer (DSB) router architecture. We introduce innovations to address the unique constraints of NoCs, including efficient pipelining and novel flow control. Our DSB design can achieve significantly higher bandwidth at saturation, with an improvement of up to 20\% when compared to a state-of-the-art pipelined IBR with the same amount of buffering, and our proposed microarchitecture can achieve up to 94\% of the ideal saturation throughput.", acknowledgement = ack-nhfb, affiliation = "Ramanujam, Rohit Sunkam; Lin, Bill, Univ Calif San Diego, San Diego, CA 92103 USA. Peh, Li-Shiuan, Princeton Univ, Princeton, NJ 08544 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "vassos.soteriou@cut.ac.cy rsunkamr@ucsd.edu billlin@ucsd.edu peh@princeton.edu", da = "2019-06-20", doc-delivery-number = "V17GC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; buffer circuits; Clocks; Computer architecture; configuration management; Delay; distributed shared-buffer; Interconnection architectures; Internet; microarchitectural configurations; Microarchitecture; network routing; Network-on-a-chip; network-on-chip; NoC router; On-chip interconnection networks; output-buffered router; Pipeline processing; router architecture; Router micro-architecture; Technological innovation; Throughput", keywords-plus = "ARCHITECTURE", number-of-cited-references = "16", ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X Soteriou, Vassos/0000-0002-2818-0459", research-areas = "Computer Science", researcherid-numbers = "Lin, Binshan/A-9772-2009 Soteriou, Vassos/H-4603-2014", times-cited = "15", unique-id = "Soteriou:2009:HTD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Guz:2009:MCV, author = "Zvika Guz and Evgeny Bolotin and Idit Keidar and Avinoam Kolodny and Avi Mendelson and Uri C. Weiser", title = "Many-Core vs. Many-Thread Machines: Stay Away From the Valley", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We study the tradeoffs between Many-Core machines like Intel's Larrabee and Many-Thread machines like Nvidia and AMD GPGPUs. 
We define a unified model describing a superposition of the two architectures, and use it to identify operation zones for which each machine is more suitable. Moreover, we identify an intermediate zone in which both machines deliver inferior performance. We study the shape of this ``performance valley'' and provide insights on how it can be avoided.", acknowledgement = ack-nhfb, affiliation = "Guz, Z (Reprint Author), Technion Israel Inst Technol, EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar, Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin, Evgeny, Intel Corp, Santa Clara, CA 95051 USA. Mendelson, Avi, Microsoft Corp, Redmond, WA 98052 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com idish@ee.technion.ac.il kolodny@ee.technion.ac.il avim@microsoft.com uri.weiser@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "V17GC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Semiconductors Research Corporation (SRC); Intel; Israeli Ministry of Science Knowledge Center on Chip MultiProcessors", funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner. This work was partially supported by Semiconductors Research Corporation (SRC), Intel, and the Israeli Ministry of Science Knowledge Center on Chip MultiProcessors.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "AMD GPGPU; architecture superposition; Bandwidth; Chip Multiprocessors; Computer Systems; coprocessors; Delay; Engines; Equations; GPGPU; Graphics; Intel's Larrabee; many-core machines; many-thread machines; Multi-core/single-chip multiprocessors; multi-threading; multiprocessing systems; Nvidia GPGPU; Parallel Architectures; parallel architectures; Parallel processing; performance valley; Processor Architectures; Shape", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "27", unique-id = "Guz:2009:MCV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Desai:2009:AIC, author = "Aniruddha Desai and Jugdutt Singh", title = "Architecture Independent Characterization of Embedded {Java} Workloads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "This paper presents architecture independent characterization of embedded Java workloads based on the industry standard GrinderBench benchmark which includes different classes of real world embedded Java applications. This work is based on a custom built embedded Java Virtual Machine (JVM) simulator specifically designed for embedded JVM modeling and embodies domain specific details such as thread scheduling, algorithms used for native CLDC APIs and runtime data structures optimized for use in embedded systems. 
The results presented include dynamic execution characteristics, dynamic bytecode instruction mix, application and API workload distribution, Object allocation statistics, instruction-set coverage, memory usage statistics and method code and stack frame characteristics.", acknowledgement = ack-nhfb, affiliation = "Desai, A (Reprint Author), La Trobe Univ, Bundoora, Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt, La Trobe Univ, Bundoora, Vic 3086, Australia.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "desai@ieee.org", da = "2019-06-20", doc-delivery-number = "V17GC", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithm design and analysis; application program interfaces; architecture independent characterization; CLDC API; custom built embedded Java virtual machine simulator; data structures; Data structures; Design optimization; dynamic bytecode instruction mix; dynamic execution characteristics; embedded Java workload; Embedded Systems; embedded systems; Embedded Systems; industry standard GrinderBench benchmark; instruction sets; instruction-set coverage; Java; Java bytecode; Job shop scheduling; JVM; memory usage statistics; method code characteristics; multi-threading; object allocation statistics; Runtime; runtime data structure; scheduling; Scheduling algorithm; stack frame characteristics; Statistical distributions; storage allocation; thread scheduling; virtual machines; Virtual machining; Workload Characterization", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "0", unique-id = "Desai:2009:AIC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Antelo:2009:CBF, author = "Elisardo Antelo", title = "A Comment on {``Beyond Fat-tree: Unidirectional Load-Balanced Multistage Interconnection Network''}", journal = 
j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "33--34", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", note = "See \cite{GomezRequena:2008:BFT}.", abstract = "A recent work proposed to simplify fat-trees with adaptive routing by means of a load-balancing deterministic routing algorithm. The resultant network has performance figures comparable to the more complex adaptive routing fat-trees when packets need to be delivered in order. In a second work by the same authors published in IEEE CAL, they propose to simplify the fat-tree to a unidirectional multistage interconnection network (UMIN), using the same load-balancing deterministic routing algorithm. They show that comparable performance figures are achieved with much lower network complexity. In this comment we show that the proposed load-balancing deterministic routing is in fact the routing scheme used by the butterfly network. Moreover we show that the properties of the simplified UMIN network proposed by them are intrinsic to the standard butterfly and other existing UMINs.", acknowledgement = ack-nhfb, affiliation = "Antelo, E (Reprint Author), Univ Santiago de Compostela, Dept Elect \& Comp Sci, Santiago De Compostela, Spain. Univ Santiago de Compostela, Dept Elect \& Comp Sci, Santiago De Compostela, Spain.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GC", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "adaptive routing fat-trees; Bismuth; butterfly network; Computer science; deterministic algorithms; fat-tree; hypercube networks; Interconnection networks; Interconnections (Subsystems); load balancing deterministic routing algorithm; Logic functions; Multiprocessor interconnection networks; Multistage Interconnection networks; network complexity; Network topology; packets; resource allocation; Routing; Switches; Technological innovation; Topology; unidirectional load-balanced multistage interconnection network; unidirectional multistage interconnection network", number-of-cited-references = "7", ORCID-numbers = "Antelo, Elisardo/0000-0003-3743-3689", research-areas = "Computer Science", times-cited = "0", unique-id = "Antelo:2009:CBF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2009:Aa, author = "Anonymous", title = "{[Advertisement]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "35--35", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.38", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:AIC, author = "Anonymous", title = "Ad --- {IEEE Computer Society Digital Library}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "36--36", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.39", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:EBCa, author = "Anonymous", title = "Editorial Board [Cover2]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "c2--c2", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.41", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:FCa, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "c1--c1", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.40", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:IAa, author = "Anonymous", title = "Information for authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "c3--c3", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.42", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:ICSa, author = "Anonymous", title = "{IEEE Computer Society} [Cover4]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "c4--c4", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.43", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Gaudiot:2009:INE, author = "Jean-Luc Gaudiot", title = "Introducing the New {Editor-in-Chief} of {{\booktitle{IEEE Computer Architecture Letters}}}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "37--38", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.60", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GD", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", number-of-cited-references = "0", research-areas = "Computer Science", times-cited = "0", unique-id = "Gaudiot:2009:INE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Skadron:2009:LE, author = "K. Skadron", title = "Letter from the {Editor}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "39--39", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.61", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Skadron:2009:U, author = "Kevin Skadron", title = "Untitled", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "39--39", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.61", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GD", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", number-of-cited-references = "0", research-areas = "Computer Science", times-cited = "0", unique-id = "Skadron:2009:U", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Xin:2009:ELI, author = "Jing Xin and Russ Joseph", title = "Exploiting Locality to Improve Circuit-level Timing Speculation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "40--43", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.50", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Circuit-level timing speculation has been proposed as a technique to reduce dependence on design margins, eliminating power and performance overheads. Recent work has proposed microarchitectural methods to dynamically detect and recover from timing errors in processor logic. This work has not evaluated or exploited the disparity of error rates at the level of static instructions. 
In this paper, we demonstrate pronounced locality in error rates at the level of static instructions. We propose timing error prediction to dynamically anticipate timing errors at the instruction-level and reduce the costly recovery penalty. This allows us to achieve 43.6\% power savings when compared to a baseline policy and incurs only 6.9\% performance penalty.", acknowledgement = ack-nhfb, affiliation = "Xin, J (Reprint Author), Northwestern Univ, Evanston, IL 60208 USA. Xin, Jing; Joseph, Russ, Northwestern Univ, Evanston, IL 60208 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-0644332, CNS-0720820]", funding-text = "Manuscript submitted: 17-Sep-2009. Manuscript accepted: 08-Oct-2009. Final manuscript received: 15-Oct-2009. We thank the anonymous reviewers for their constructive feedback. This work was supported by NSF awards CAREER CCF-0644332 and CNS-0720820.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Circuit faults; circuit reliability; circuit-level timing speculation; Costs; Delay; Dynamic voltage scaling; Error analysis; Error locality; Frequency; Hardware; instruction sets; Logic; logic design; low-power design; Low-power design; microarchitectural methods; microprocessor chips; Pipelines; power elimination; processor logic; reliability; Reliability; static instruction level; Testing and Fault-Tolerance; Timing; timing error prediction; timing speculation", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "1", unique-id = "Xin:2009:ELI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Sudarsanam:2009:PPD, author = "Arvind Sudarsanam and Ramachandra Kallam and Aravind Dasu", title = "{PRR--PRR} Dynamic Relocation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "44--47", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.49", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Partial bitstream relocation (PBR) on FPGAs has been gaining attention in recent years as a potentially promising technique to scale parallelism of accelerator architectures at run time, enhance fault tolerance, etc. PBR techniques to date have focused on reading inactive bitstreams stored in memory, on-chip or off-chip, whose contents are generated for a specific partial reconfiguration region (PRR) and modified on demand for configuration into a PRR at a different location. As an alternative, we propose a PRR-PRR relocation technique to generate source and destination addresses, read the bitstream from an active PRR (source) in a non-intrusive manner, and write it to destination PRR. 
We describe two options of realizing this on Xilinx Virtex 4 FPGAs: (a) hardware-based accelerated relocation circuit (ARC) and (b) a software solution executed on Microblaze. A comparative performance analysis to highlight the speed-up obtained using ARC is presented. For real test cases, performance of our implementations are compared to estimated performances of two state of the art methods.", acknowledgement = ack-nhfb, affiliation = "Sudarsanam, A (Reprint Author), Utah State Univ, Dept Elect \& Comp Engn, Logan, UT 84321 USA. Sudarsanam, Arvind; Kallam, Ramachandra; Dasu, Aravind, Utah State Univ, Dept Elect \& Comp Engn, Logan, UT 84321 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "arvind.sudarsanam@aggiemail.usu.edu ramachandra.kallam@aggiemail.usu.edu dasu@engineering.usu.edu", da = "2019-06-20", doc-delivery-number = "V17GD", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NASA; Micron Research Center", funding-text = "Manuscript submitted: 03-Aug-2009. Manuscript accepted: 16-Sep-2009. Final manuscript received: 24-Sep-2009. This work was supported by NASA and Micron Research Center.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; Accelerator architectures; accelerator architectures; Bioreactors; Circuits; destination address; Emerging technologies; Fault tolerance; fault tolerance; field programmable gate arrays; Field programmable gate arrays; Filters; FPGAs; Hardware; hardware-based accelerated relocation circuit; parallel architecture; parallel architectures; Parallel processing; partial bitstream relocation; Partial dynamic reconfiguration; Partial dynamic relocation; partial reconfiguration region; PBR techniques; Performance analysis; Performance Analysis and Design Aids; PRR-PRR dynamic relocation technique; PRR-PRR relocation technique; Reconfigurable computing; Reconfigurable hardware; source address; Xilinx Virtex 4 FPGA", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "9", unique-id = "Sudarsanam:2009:PPD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Leverich:2009:PMD, author = "Jacob Leverich and Matteo Monchiero and Vanish Talwar and Partha Ranganathan and Christos Kozyrakis", title = "Power Management of Datacenter Workloads Using Per-Core Power Gating", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "48--51", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.46", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "While modern processors offer a wide spectrum of software-controlled power modes, most datacenters only rely on Dynamic Voltage and Frequency Scaling (DVFS, a.k.a. P-states) to achieve energy efficiency. This paper argues that, in the case of datacenter workloads, DVFS is not the only option for processor power management. 
We make the case for per-core power gating (PCPG) as an additional power management knob for multi-core processors. PCPG is the ability to cut the voltage supply to selected cores, thus reducing to almost zero the leakage power for the gated cores. Using a testbed based on a commercial 4-core chip and a set of real-world application traces from enterprise environments, we have evaluated the potential of PCPG. We show that PCPG can significantly reduce a processor's energy consumption (up to 40\%) without significant performance overheads. When compared to DVFS, PCPG is highly effective saving up to 30\% more energy than DVFS. When DVFS and PCPG operate together they can save up to almost 60\%.", acknowledgement = ack-nhfb, affiliation = "Leverich, J (Reprint Author), Hewlett Packard Labs, Mississauga, ON, Canada. Leverich, Jacob; Monchiero, Matteo; Talwar, Vanish; Ranganathan, Partha, Hewlett Packard Labs, Mississauga, ON, Canada. Leverich, Jacob; Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GD", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; computer centres; Costs; data center workloads; dynamic voltage and frequency scaling; Dynamic voltage scaling; Energy consumption; energy efficiency; Energy management; Energy-aware systems; enterprise environments; Frequency; integration and modeling; Jacobian matrices; leakage power; microprocessor chips; Multicore processing; multicore processors; per-core power gating; power consumption; Power supplies; processor energy consumption; processor power management; software-controlled power modes; System architectures; Testing", number-of-cited-references = "10", oa = "Green Published", research-areas = "Computer Science", times-cited = "43", unique-id = "Leverich:2009:PMD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Musoll:2009:PVA, author = "Enric Musoll", title = "A Process-Variation Aware Technique for Tile-Based, Massive Multicore Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "52--55", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.48", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Process variations in advanced nodes introduce significant core-to-core performance differences in single-chip multicore architectures. Isolating each core with its own frequency and voltage island helps improving the performance of the multi-core architecture by operating at the highest frequency possible rather than operating all the cores at the frequency of the slowest core. However, inter-core communication suffers from additional cross-clock-domain latencies that can offset the performance benefits. 
This work proposes the concept of the configurable, variable-size frequency and voltage domain, and it is described in the context of a tile-based, massive multi-core architecture.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "V17GD", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Clocks; computer architecture; Context; cross-clock-domain latency; Delay; Frequency; intercore communication; massive multi-core; massive multicore processors; Multi-core/single-chip multiprocessors; multicore architecture; Multicore processing; Network-on-a-chip; network-on-chip; On-chip interconnection networks; Performance gain; Process design; process-variation aware architecture; process-variation aware technique; Runtime; single-chip multicore architectures; tile-base architecture; tile-based multicore processors; variable-size frequency domain; Voltage; voltage domain", number-of-cited-references = "5", research-areas = "Computer Science", times-cited = "3", unique-id = "Musoll:2009:PVA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Baldassin:2009:CEC, author = "Alexandro Baldassin and Felipe Klein and Guido Araujo and Rodolfo Azevedo and Paulo Centoducatte", title = "Characterizing the Energy Consumption of Software Transactional Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "56--59", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.47", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The well-known drawbacks imposed by lock-based synchronization have forced researchers to devise new alternatives for 
concurrent execution, of which transactional memory is a promising one. Extensive research has been carried out on Software Transactional Memory (STM), most of all concentrated on program performance, leaving unattended other metrics of great importance like energy consumption. This letter presents a thorough evaluation of energy consumption in a state-of-the-art STM. We show that energy and performance results do not always follow the same trend and, therefore, it might be appropriate to consider different strategies depending on the focus of the optimization. We also introduce a novel strategy based on dynamic voltage and frequency scaling for contention managers, revealing important energy and energy-delay product improvements in high-contended scenarios. This work is a first study towards a better understanding of the energy consumption behavior of STM systems, and could prompt STM designers to research new optimizations in this area, paving the way for an energy-aware transactional memory.", acknowledgement = ack-nhfb, affiliation = "Baldassin, A (Reprint Author), Univ Estadual Campinas, Inst Comp, Campinas, SP, Brazil. Baldassin, Alexandro; Klein, Felipe; Araujo, Guido; Azevedo, Rodolfo; Centoducatte, Paulo, Univ Estadual Campinas, Inst Comp, Campinas, SP, Brazil.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "alebal@ic.unicamp.br klein@ic.unicamp.br guido@ic.unicamp.br rodolfo@ic.unicamp.br ducatte@ic.unicamp.br", da = "2019-06-20", doc-delivery-number = "V17GD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "FAPESP [2005/02565-9]", funding-text = "Manuscript submitted: 02-Jul-2009. Manuscript accepted: 23-Jul-2009. Final manuscript received: 05-Aug-2009. This work was supported in part by FAPESP (2005/02565-9).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Concurrent computing; Concurrent Programming; Content management; Costs; Dynamic voltage scaling; Energy Consumption; Energy consumption; energy consumption; Energy management; Energy-aware systems; energy-delay product improvements; frequency scaling; Frequency synchronization; Hardware; lock-based synchronization; Measurement techniques; Memory management; multiprocessing systems; Multiprocessor Systems; multiprocessor systems; Multiprocessor Systems; Parallel Architectures; parallel architectures; Power Management; Software performance; software transactional memory; synchronisation; transaction processing; Transactional Memory", number-of-cited-references = "13", ORCID-numbers = "Azevedo, Rodolfo/0000-0002-8803-0401", research-areas = "Computer Science", researcherid-numbers = "Azevedo, Rodolfo/F-3008-2012", times-cited = "3", unique-id = "Baldassin:2009:CEC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Balfour:2009:ORE, author = "James Balfour and R. Curtis Harting and William J. Dally", title = "Operand Registers and Explicit Operand Forwarding", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "60--63", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.45", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Operand register files are small, inexpensive register files that are integrated with function units in the execute stage of the pipeline, effectively extending the pipeline operand registers into register files. Explicit operand forwarding lets software opportunistically orchestrate the routing of operands through the forwarding network to avoid writing ephemeral values to registers. 
Both mechanisms let software capture short-term reuse and locality close to the function units, improving energy efficiency by allowing a significant fraction of operands to be delivered from inexpensive registers that are integrated with the function units. An evaluation shows that capturing operand bandwidth close to the function units allows operand registers to reduce the energy consumed in the register files and forwarding network of an embedded processor by 61\%, and allows explicit forwarding to reduce the energy consumed by 26\%.", acknowledgement = ack-nhfb, affiliation = "Balfour, J (Reprint Author), Stanford Univ, Comp Syst Lab, Stanford, CA 94305 USA. Balfour, James; Harting, R. Curtis; Dally, William J., Stanford Univ, Comp Syst Lab, Stanford, CA 94305 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jbalfour@cva.stanford.edu dally@cva.stanford.edu", da = "2019-06-20", doc-delivery-number = "V17GD", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Code generation; Computer aided instruction; Computer System Implementation; Computer Systems Organizat; embedded processor; Energy capture; energy consumption; energy efficient register organization; explicit operand forwarding; explicit operand forwarding network; Fixed-point arithmetic; impact of technology trends; Impact of VLSI on system design; Laboratories; Logic; low-power programmable processors; Memory hierarchy; microprocessor chips; operand bandwidth; operand register files; operand registers; Optimization; Physically aware micro-architecture: power; Pipelines; Real-time and embedded systems; Registers; Routing; software reusability; thermal; VLSI Systems; Writing", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "9", unique-id = "Balfour:2009:ORE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Chiou:2009:AFF, author = "Derek Chiou and Hari Angepat and Nikhil A. Patil and Dam Sunwoo", title = "Accurate Functional-First Multicore Simulators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "64--67", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.44", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Fast and accurate simulation of multicore systems requires a parallelized simulator. This paper describes a novel method to build parallelizable and cycle-accurate-capable functional-first simulators of multicore targets.", acknowledgement = ack-nhfb, affiliation = "Chiou, D (Reprint Author), Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA. 
Chiou, Derek; Angepat, Hari; Patil, Nikhil A.; Sunwoo, Dam, Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "derek@ece.utexas.edu angepat@ece.utexas.edu npatil@ece.utexas.edu sunwoo@ece.utexas.edu", da = "2019-06-20", doc-delivery-number = "V17GD", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [0615352, 0747438]", funding-text = "This material is based upon work supported by the National Science Foundation under Grants No. 0615352 and No. 0747438 and gifts from Intel and IBM. We thank the anonymous reviewers for their comments.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "circuit simulation; Computational modeling; Computer simulation; field programmable gate arrays; FPGA-accelerated simulation technologies; functional-first multicore simulators; Instruction sets; integration and modeling; Microarchitecture; Modeling and Visualization; Modeling of computer architecture; Modeling techniques; Multi-core/single-chip multiprocessors; Multicore processing; multicore system simulation; Parallel; Parallel Architectures; parallelized simulator; Performance Analysis and Design Aids; Predictive models; Simulation; Software prototyping; System architectures; Timing; Virtual machining; Virtual prototyping", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "7", unique-id = "Chiou:2009:AFF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2009:Ab, author = "Anonymous", title = "{[Advertisement]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "68--68", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.52", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue 
Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:Ac, author = "Anonymous", title = "{[Advertisement]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "69--69", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.53", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:Ad, author = "Anonymous", title = "{[Advertisement]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "70--70", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.55", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:Ae, author = "Anonymous", title = "{[Advertisement]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "71--71", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.54", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:Af, author = "Anonymous", title = "{[Advertisement]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "72--72", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.51", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:EBCb, author = "Anonymous", title = "Editorial Board [Cover2]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "c2--c2", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.57", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:FCb, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "c1--c1", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.56", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:IAb, author = "Anonymous", title = "Information for authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "c3--c3", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.58", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2009:ICSb, author = "Anonymous", title = "{IEEE Computer Society} [Cover4]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "2", pages = "c4--c4", month = jul # "\slash " # dec, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.59", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Patil:2010:URT, author = "Shruti Patil and David J. Lilja", title = "Using Resampling Techniques to Compute Confidence Intervals for the Harmonic Mean of Rate-Based Performance Metrics", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Rate-based metrics such as floating point operations per second, instructions per cycle and so forth are commonly used to measure computer performance. In addition to the average or mean performance of the metric, indicating the precision of the mean using confidence intervals helps to make informed decisions and comparisons with the data. In this paper, we discuss the determination of confidence intervals for the harmonic mean of rate-based metrics using two statistical resampling techniques Jackknife and Bootstrap. We show using Monte Carlo simulations that resampling indeed works as expected, and can be used for generating confidence intervals for harmonic mean.", acknowledgement = ack-nhfb, affiliation = "Patil, S (Reprint Author), Univ Minnesota Twin Cities, Dept Elect \& Comp Engn, St Paul, MN USA. Patil, Shruti; Lilja, David J., Univ Minnesota Twin Cities, Dept Elect \& Comp Engn, St Paul, MN USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CCF-0541162]", funding-text = "This work was supported in part by the National Science Foundation grant no. CCF-0541162. 
Any opinions, findings and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the NSF. The authors also thank the University of Minnesota Statistical Consulting Service for their helpful insights.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arithmetic; bootstrap; bootstrap technique; Cities and towns; Computer errors; Computer performance; computer performance measurement; Confidence intervals; confidence intervals; Electric variables measurement; Equations; floating point operations; Harmonic analysis; harmonic mean; jackknife; jackknife technique; Monte Carlo methods; Monte Carlo simulations; Nonparametric statistics; Performance analysis; performance evaluation; Performance of Systems; Probability distribution; rate-based performance metrics; resampling; statistical analysis; statistical resampling techniques; Statistics", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "9", unique-id = "Patil:2010:URT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Seznec:2010:PCM, author = "Andre Seznec", title = "A Phase Change Memory as a Secure Main Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/prng.bib", abstract = "Phase change memory (PCM) technology appears as more scalable than DRAM technology. As PCM exhibits access time slightly longer but in the same range as DRAMs, several recent studies have proposed to use PCMs for designing main memory systems. 
Unfortunately PCM technology suffers from a limited write endurance; typically each memory cell can only be written a large but still limited number of times ($10^7$ to $10^9$ writes are reported for current technology). Till now, research proposals have essentially focused their attention on designing memory systems that will survive to the average behavior of conventional applications. However PCM memory systems should be designed to survive worst-case applications, i.e., malicious attacks targeting the physical destruction of the memory through overwriting a limited number of memory cells.", acknowledgement = ack-nhfb, affiliation = "Seznec, A (Reprint Author), INRIA Rennes Bretagne Atlantique, Ctr Rech, Campus Beaulieu, F-35042 Rennes, France. INRIA Rennes Bretagne Atlantique, Ctr Rech, F-35042 Rennes, France.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "seznec@irisa.fr", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "European Commission [27648]", funding-text = "This work was partially supported by the European Commission in the context of the SARC integrated project \#27648 (FP6).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application software; DRAM technology; Energy consumption; memory cells; Memory Structures; PCM memory systems; Phase change materials; phase change memories; phase change memory; Phase change memory; Physics computing; Proposals; Random access memory; Random number generation; Random processes; Scalability; secure PCM-based main memory; Semiconductor Memories", keywords-plus = "TECHNOLOGY", number-of-cited-references = "8", oa = "Green Published", research-areas = "Computer Science", times-cited = "17", unique-id = "Seznec:2010:PCM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Park:2010:EIP, author = "Seon-yeong Park and Euiseong Seo and Ji-Yong Shin and Seungryoul Maeng and Joonwon Lee", title = "Exploiting Internal Parallelism of Flash-based {SSDs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "For the last few years, the major driving force behind the rapid performance improvement of SSDs has been the increment of parallel bus channels between a flash controller and flash memory packages inside the solid-state drives (SSDs). However, there are other internal parallelisms inside SSDs yet to be explored. In order to improve performance further by utilizing the parallelism, this paper suggests request rescheduling and dynamic write request mapping. 
Simulation results with real workloads have shown that the suggested schemes improve the performance of the SSDs by up to 15\% without any additional hardware support.", acknowledgement = ack-nhfb, affiliation = "Park, SY (Reprint Author), Korea Adv Inst Sci \& Technol, Taejon, South Korea. Park, Seon-yeong; Shin, Ji-Yong; Maeng, Seungryoul, Korea Adv Inst Sci \& Technol, Taejon, South Korea. Seo, Euiseong, Ulsan Natl Inst Sci \& Technol, Ulsan, South Korea. Lee, Joonwon, Sungkyunkwan Univ, Seoul, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Korea government(MEST) [2009-0080381]", funding-text = "This work was supported by the Korea Science and Engineering Foundation (KOSEF) grant funded by the Korea government (MEST), (No. 2009-080381)", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Delay; Drives; exploiting internal parallelism; flash based SSD; flash controller; flash memories; Flash memory; flash memory packages; Force control; Hard disks; I/O scheduling; Input/Output Devices; Packaging; parallel bus channels; parallel processing; Parallel systems; parallelism; pipeline processing; Pipeline processing; Secondary storage; Simulation; Solid state circuits; solid state drives; Solid-State Drives (SSDs); Space technology; Storage Management; system buses; Throughput", number-of-cited-references = "6", research-areas = "Computer Science", researcherid-numbers = "Maeng, Seungryoul/C-1882-2011", times-cited = "35", unique-id = "Park:2010:EIP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Subramoni:2010:ISI, author = "Hari Subramoni and Fabrizio Petrini and Virat Agarwal and Davide Pasetto", title = "Intra-Socket and Inter-Socket Communication in Multi-core Systems", journal = 
j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The increasing computational and communication demands of the scientific and industrial communities require a clear understanding of the performance trade-offs involved in multi-core computing platforms. Such analysis can help application and toolkit developers in designing better, topology aware, communication primitives intended to suit the needs of various high end computing applications. In this paper, we take on the challenge of designing and implementing a portable intra-core communication framework for streaming computing and evaluate its performance on some popular multi-core architectures developed by Intel, AMD and Sun. Our experimental results, obtained on the Intel Nehalem, AMD Opteron and Sun Niagara 2 platforms, show that we are able to achieve an intra-socket small message latency between 120 and 271 nanoseconds, while the inter-socket small message latency is between 218 and 320 nanoseconds. The maximum intra-socket communication bandwidth ranges from 0.179 (Sun Niagara 2) to 6.5 (Intel Nehalem) Gbytes/second. We were also able to obtain an inter-socket communication performance of 1.2 and 6.6 Gbytes/second on the AMD Opteron and Intel Nehalem, respectively.", acknowledgement = ack-nhfb, affiliation = "Subramoni, H (Reprint Author), IBM TJ Watson, Yorktown Hts, NY 10598 USA. Subramoni, Hari; Petrini, Fabrizio; Agarwal, Virat, IBM TJ Watson, Yorktown Hts, NY 10598 USA. Pasetto, Davide, IBM Computat Sci Ctr, Dublin, Ireland. Subramoni, Hari, Ohio State Univ, Columbus, OH 43210 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "subramon@cse.ohio-state.edu fpetrin@us.ibm.com viratagarwal@us.ibm.com pasetto\_davide@ie.ibm.com", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "AMD Opteron; Bandwidth; Communication industry; communication primitives; Communication Protocols; Computer applications; Computer architecture; Computer industry; Delay; General; Hardware; High Performance Computing; industrial communities; Intel Nehalem; intersocket communication; Intrasocket communication; multicore architectures; Multicore Processors; multicore systems; multiprocessing systems; parallel architectures; Performance of Systems; Portable computers; streaming computing; Sun; toolkit developers; Topology; topology aware", keywords-plus = "NETWORK", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "6", unique-id = "Subramoni:2010:ISI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hoang:2010:CAN, author = "Giang Hoang and Chang Bae and John Lange and Lide Zhang and Peter Dinda and Russ Joseph", title = "A Case for Alternative Nested Paging Models for Virtualized Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Address translation often emerges as a critical performance bottleneck for virtualized systems and has recently been the impetus for hardware 
paging mechanisms. These mechanisms apply similar translation models for both guest and host address translations. We make an important observation that the model employed to translate from guest physical addresses (GPAs) to host physical addresses (HPAs) is in fact orthogonal to the model used to translate guest virtual addresses (GVAs) to GPAs. Changing this model requires VMM cooperation, but has no implications for guest OS compatibility. As an example, we consider a hashed page table approach for GPA $\rightarrow$ HPA translation. Nested paging, widely considered the most promising approach, uses unhashed multi-level forward page tables for both GVA $\rightarrow$ GPA and GPA $\rightarrow$ HPA translations, resulting in a potential $O(n^2)$ page walk cost on a TLB miss, for n-level page tables. In contrast, the hashed page table approach results in an expected $O(n)$ cost. Our simulation results show that when a hashed page table is used in the nested level, the performance of the memory system is not worse, and sometimes even better than a nested forward-mapped page table due to reduced page walks and cache pressure. This showcases the potential for alternative paging mechanisms.", acknowledgement = ack-nhfb, affiliation = "Hoang, GA (Reprint Author), Northwestern Univ, Evanston, IL 60208 USA. Hoang, Giang; Bae, Chang; Lange, John; Dinda, Peter; Joseph, Russ, Northwestern Univ, Evanston, IL 60208 USA. Zhang, Lide, Univ Michigan, Ann Arbor, MI 48109 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "address translation; Computer Architecture; Computer architecture; Computer Architecture; Computer displays; Control systems; Costs; Emerging technologies; file organisation; guest physical addresses; guest virtual addresses; Hardware; hardware paging mechanisms; Hardware/software interfaces; host physical addresses; Instruction sets; Nested Paging; nested paging models; Operating systems; OS compatibility; paged storage; Platform virtualization; Software performance; storage allocation; unhashed multilevel forward page tables; virtual machine monitors; Virtual machine monitors; virtual machines; Virtual Memory; Virtualization; virtualized systems; VMM cooperation", number-of-cited-references = "11", research-areas = "Computer Science", researcherid-numbers = "Joseph, Russell/B-7230-2009 Dinda, Peter/B-7142-2009", times-cited = "5", unique-id = "Hoang:2010:CAN", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Krimer:2010:SNT, author = "Evgeni Krimer and Robert Pawlowski and Mattan Erez and Patrick Chiang", title = "{Synctium}: a Near-Threshold Stream Processor for Energy-Constrained Parallel Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "While Moore's law scaling continues to double transistor density every technology generation, supply voltage reduction has essentially stopped, increasing both power density and total energy consumed in conventional microprocessors. 
Therefore, future processors will require an architecture that can: (a) take advantage of the massive amount of transistors that will be available; and (b) operate these transistors in the near-threshold supply domain, thereby achieving near optimal energy/computation by balancing the leakage and dynamic energy consumption. Unfortunately, this optimality is typically achieved while running at very low frequencies (i.e., 0.1--10MHz) and with only one computation executing per cycle, such that performance is limited. Further, near-threshold designs suffer from severe process variability that can introduce extremely large delay variations. In this paper, we propose a near energy-optimal, stream processor family that relies on massively parallel, near-threshold VLSI circuits and interconnect, incorporating cooperative circuit/architecture techniques to tolerate the expected large delay variations. Initial estimations from circuit simulations show that it is possible to achieve greater than 1 Giga-Operations per second (1GOP/s) with less than 1mW total power consumption, enabling a new class of energy-constrained, high-throughput computing applications.", acknowledgement = ack-nhfb, affiliation = "Krimer, E (Reprint Author), UT Austin, ECE, Austin, TX USA. Krimer, Evgeni; Erez, Mattan, UT Austin, ECE, Austin, TX USA. Pawlowski, Robert; Chiang, Patrick, Oregon State Univ, EECS, Corvallis, OR 97331 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Circuits; Computer architecture; conventional microprocessors; Delay; double transistor density; dynamic energy consumption; energy constrained parallel applications; Energy consumption; etc.; Frequency; impact of technology trends; Low-power design; Microprocessors; Mobile processors; Moore's Law; near threshold stream processor; optimisation; parallel programming; Physically aware micro-architecture: power; pipeline processing; Power generation; SIMD processors; supply voltage reduction; Synctium; thermal; Very large scale integration; VLSI circuits; Voltage", keywords-plus = "CIRCUITS; TOLERANCE; CMOS", number-of-cited-references = "19", oa = "Green Published", research-areas = "Computer Science", times-cited = "22", unique-id = "Krimer:2010:SNT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hilton:2010:SDE, author = "Andrew Hilton and Amir Roth", title = "{SMT-Directory}: Efficient Load-Load Ordering for {SMT}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Memory models like SC, TSO, and PC enforce load-load ordering, requiring that loads from any single thread appear to occur in program order to all other threads. Out-of-order execution can violate load-load ordering. Conventional multi-processors with out-of-order cores detect load-load ordering violations by snooping an age-ordered load queue on cache invalidations or evictions-events that act as proxies for the completion of remote stores. 
This mechanism becomes less efficient in an SMT processor, as every completing store must search the load queue segments of all other threads. This inefficiency exists because store completions from other threads in the same core are not filtered by the cache and coherence protocol: thread 0 observes all of thread 1's stores, not only the first store to every cache line. SMT-Directory eliminates this overhead by implementing the filtering traditionally provided by the cache in the cache itself. SMT-Directory adds a per-thread ``read'' bit to every data cache line. When a load executes, it sets the bit corresponding to its thread. When a store completes and writes to the cache, it checks the SMT-Directory bits of its cache line and searches the load queue segments only of those threads whose bits are set. As a result, local store completions trigger searches only for data that is actually shared.", acknowledgement = ack-nhfb, affiliation = "Hilton, A (Reprint Author), Univ Penn, Philadelphia, PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn, Philadelphia, PA 19104 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-0541292]", funding-text = "We thank Arun Raghavan for the address traces and Milo Martin for comments on early versions of this work. The anonymous reviewers provided valuable feedback. This work was supported by NSF award CCF-0541292.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "age-ordered load queue; Buffer storage; cache invalidations; cache protocol; cache storage; coherence protocol; consistency models; data cache line; directory; Filtering; Load modeling; load queue search; load queue segments; load-load ordering; Memory hierarchy; multi-threading; multiprocessing systems; Multithreaded processors; Multithreading; Out of order; Protocols; Read-write memory; Simultaneous multithreading; SMT processor; Surface-mount technology; Writing", keywords-plus = "CONSISTENCY", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "0", unique-id = "Hilton:2010:SDE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hammoud:2010:DPA, author = "Mohammad Hammoud and Sangyeun Cho and Rami G. Melhem", title = "A Dynamic Pressure-Aware Associative Placement Strategy for Large Scale Chip Multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper describes dynamic pressure-aware associative placement (DPAP), a novel distributed cache management scheme for large-scale chip multiprocessors. Our work is motivated by the large non-uniform distribution of memory accesses across cache sets in different L2 banks. DPAP decouples the physical locations of cache blocks from their addresses for the sake of reducing misses caused by destructive interferences. 
Temporal pressure at the on-chip last-level cache is continuously collected at a group (comprised of local cache sets) granularity, and periodically recorded at the memory controller(s) to guide the placement process. An incoming block is consequently placed at a cache group that exhibits the minimum pressure. Simulation results using a full-system simulator demonstrate that DPAP outperforms the baseline shared NUCA scheme by an average of 8.3\% and by as much as 18.9\% for the benchmark programs we examined. Furthermore, evaluations showed that DPAP outperforms related cache designs.", acknowledgement = ack-nhfb, affiliation = "Hammoud, M (Reprint Author), Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA. Hammoud, Mohammad; Cho, Sangyeun; Melhem, Rami G., Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mhh@cs.pitt.edu cho@cs.pitt.edu melhem@cs.pitt.edu", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-0952273]", funding-text = "This work was supported in part by NSF grant CCF-0952273.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Aggregate Cache Sets; Aggregates; Associative Placement; cache storage; Chip Multiprocessors; Computer architecture; Computer science; destructive interferences; distributed cache management; DPAP; dynamic pressure aware associative placement strategy; Interference; large scale chip multiprocessors; Large-scale systems; Local Cache Sets; memory access distribution; memory controllers; microprocessor chips; Network-on-a-chip; NUCA scheme; Pressure control; Pressure-Aware Placement; Random access memory", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "4", unique-id = "Hammoud:2010:DPA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2010:LUC, author = "Hyungjun Kim and Paul V. Gratz", title = "Leveraging Unused Cache Block Words to Reduce Power in {CMP} Interconnect", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "33--36", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Power is of paramount importance in modern computer system design. In particular, the cache interconnect in future CMP designs is projected to consume up to half of the system power for cache fills and spills [8]. Despite the power consumed by spills and fills, a significant percentage of each cache line is unused prior to eviction from the cache. If unused cache block words can be identified, this information can be used to improve CMP interconnect power and energy consumption. We propose a new method of CMP interconnect packet composition, leveraging unused data to reduce power. 
These methods are well suited to interconnection networks with high-bandwidth wires, and do not require expensive multi-ported memory systems. Assuming perfect prediction, our techniques achieve an average of $\sim$37\% savings in total dynamic link power consumption. With our current best prediction mechanism, our techniques reduce dynamic power consumption by $\sim$23\% on average.", acknowledgement = ack-nhfb, affiliation = "Kim, H (Reprint Author), Texas A\&M Univ, Dept Elect \& Comp Engn, College Stn, TX 77843 USA. Kim, Hyungjun; Gratz, Paul V., Texas A\&M Univ, Dept Elect \& Comp Engn, College Stn, TX 77843 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "hyungjuk@tamu.edu pgratz@tamu.edu", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; cache fills; cache interconnect; Cache memories; cache spills; cache storage; CMP interconnect; computer system design; Delay; dynamic power; Energy consumption; energy consumption; flit encoding; integrated circuit design; Interconnection architectures; Low-power design; memory system; microprocessor chips; Multicore; Multiprocessor interconnection networks; Network-on-a-chip; NoC; power aware computing; Power engineering computing; power reduction; Power system interconnection; Random access memory; total dynamic link power consumption; unused cache block words; Very large scale integration; Wires", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Kim:2010:LUC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2010:EBCa, author = "Anonymous", title = "Editorial Board [Cover2]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "c2--c2", month = jan # "\slash " # 
jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:FCa, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "c1--c1", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:IAa, author = "Anonymous", title = "Information for authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "c3--c3", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ICSa, author = "Anonymous", title = "{IEEE Computer Society} [Cover4]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "c4--c4", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Skadron:2010:ELE, author = "K. Skadron", title = "Editorial: Letter from the {Editor-in-Chief}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "37--44", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.27", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Skadron:2010:U, author = "Kevin Skadron", title = "Untitled", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "37--44", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.27", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "731BX", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", number-of-cited-references = "0", research-areas = "Computer Science", times-cited = "0", unique-id = "Skadron:2010:U", xxnote = "Apparent duplicate of entry Skadron:2010:ELE: same DOI, volume, and page range, derived from a second data source.", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Iqbal:2010:POS, author = "Syed Muhammad Zeeshan Iqbal and Yuchen Liang and Hakan Grahn", title = "{ParMiBench} --- an Open-Source Benchmark for Embedded Multiprocessor Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "45--48", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.14", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Multicore processors are the main computing platform in laptops, desktop, and servers today, and are making their way into the embedded systems market also. Using benchmarks is a common approach to evaluate the performance of a system. 
However, benchmarks for embedded systems have so far been either targeted for a uni-processor environment, e.g., MiBench, or have been commercial, e.g., MultiBench by EEMBC. In this paper, we propose and implement an open source benchmark, ParMiBench, targeted for multiprocessor-based embedded systems. ParMiBench consists of parallel implementations of seven compute intensive algorithms from the uni-processor benchmark suite MiBench. The applications are selected from four domains: Automation and Industry Control, Network, Office, and Security.", acknowledgement = ack-nhfb, affiliation = "Iqbal, SMZ (Reprint Author), Blekinge Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden. Iqbal, Syed Muhammad Zeeshan; Liang, Yuchen; Grahn, Hakan, Blekinge Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mzeeshan01@gmail.com yuchen9760@gmail.com hakan.grahn@bth.se", da = "2019-06-20", doc-delivery-number = "731BX", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "benchmark testing; Benchmark testing; Concurrent Programming; desktop; embedded multiprocessor system; Embedded system; embedded system market; embedded systems; intensive algorithm; laptop; Load management; Multicore processing; multiprocessing systems; Multiprocessor Systems; open-source benchmark; parallel architectures; parallel implementation; ParMiBench; Performance Evaluation; Performance evaluation; Performance Evaluation; Program processors; public domain software; Security; uniprocessor benchmark suite", number-of-cited-references = "9", ORCID-numbers = "Grahn, Hakan/0000-0001-9947-1088", research-areas = "Computer Science", researcherid-numbers = "Grahn, Hakan/G-9720-2011", times-cited = "32", unique-id = "Iqbal:2010:POS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Fang:2010:BRP, author = "Zhen Fang and Erik G. Hallnor and Bin Li and Michael Leddige and Donglai Dai and Seung Eun Lee and Srihari Makineni and Ravi Iyer", title = "{Boomerang}: Reducing Power Consumption of Response Packets in {NoCs} with Minimal Performance Impact", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "49--52", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Most power reduction mechanisms for NoC channel buffers rely on on-demand wakeup to transition from a low-power state to the active state. Two drawbacks of on-demand wakeup limit its effectiveness: (1) performance impact caused by wakeup delays, and (2) energy and area cost of sleep circuitry itself. What makes the problem harder to solve is that solutions to either problem tend to exacerbate the other. 
For example, faster wakeup from a power-gated state requires greater charge/discharge current for the sleep transistors while using nimbler sleep transistors implies long wakeup delays. As a result, powerdowns have to be conservatively prescribed, missing many power-saving opportunities. We propose Boomerang, a novel power-saving method that overcomes the above drawbacks. Specifically, based on the observation that a response is always preceded by a request, we let the request trigger wakeup of the buffer that is to be used by its response in the (near) future, instead of using on-demand wakeups. Hiding the wakeup delay completely, Boomerang allows us to employ aggressive sleep policies and use low-cost power gating circuits on response buffers.", acknowledgement = ack-nhfb, affiliation = "Fang, Z (Reprint Author), Intel Corp, Santa Clara, CA 95051 USA. Fang, Zhen; Hallnor, Erik G.; Li, Bin; Leddige, Michael; Dai, Donglai; Makineni, Srihari; Iyer, Ravi, Intel Corp, Santa Clara, CA 95051 USA. Lee, Seung Eun, Seoul Natl Univ Sci \& Technol, Seoul, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "zhen.fang@intel.com", da = "2019-06-20", doc-delivery-number = "731BX", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Boomerang; buffer circuits; charge-discharge current; Delay; Interconnection networks; Leakage currents; leakage power; low-cost power gating circuits; low-power design; Mobile communication; network-on-chip; nimbler sleep transistors; NoC channel buffers; packet-switching networks; power aware computing; power consumption reduction mechanism; power-gated state; power-saving method; response packets; Routing; Switches; System-on-a-chip; Transistors; wakeup delay", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "1", unique-id = "Fang:2010:BRP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lyons:2010:ASF, author = "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei and David Brooks", title = "The Accelerator Store framework for high-performance, low-power accelerator-based systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "53--56", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.16", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware acceleration can increase performance and reduce energy consumption. To maximize these benefits, accelerator-based systems that emphasize computation on accelerators (rather than on general purpose cores) should be used. We introduce the ``accelerator store,'' a structure for sharing memory between accelerators in these accelerator-based systems. The accelerator store simplifies accelerator I/O and reduces area by mapping memory to accelerators when needed at runtime. 
Preliminary results demonstrate a 30\% system area reduction with no energy overhead and less than 1\% performance overhead in contrast to conventional DMA schemes.", acknowledgement = ack-nhfb, affiliation = "Lyons, MJ (Reprint Author), Harvard Univ, Sch Engn \& Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael J.; Brooks, David, Harvard Univ, Sch Engn \& Appl Sci, Cambridge, MA 02138 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mjlyons@eecs.harvard.edu mhempstead@coe.drexel.edu guyeon@eecs.harvard.edu dbrooks@eecs.harvard.edu", da = "2019-06-20", doc-delivery-number = "731BX", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [IIS-0926148]; Gigascale Systems Research Center", funding-text = "This material is based upon work supported by the National Science Foundation under Grant No. IIS-0926148. The authors acknowledge the support of the Gigascale Systems Research Center, one of six research centers funded under the Focus Center Research Program (FCRP), a Semiconductor Research Corporation entity.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; accelerator store framework; energy consumption; General; hardware acceleration; Heterogeneous (hybrid) systems; high-performance low-power accelerator-based system; low-power electronics; memory architecture; Memory management; memory mapping; memory sharing; Program processors; Random access memory; Real time systems; Real-time and embedded systems; shared memory systems; storage management; Throughput; Transform coding", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "13", unique-id = "Lyons:2010:ASF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Manevich:2010:CAR, author = "Ran Manevich and Israel Cidon and Avinoam Kolodny and Isask'har Walter", title = "Centralized Adaptive Routing for {NoCs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "57--60", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.17", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As the number of applications and programmable units in CMPs and MPSoCs increases, the Network-on-Chip (NoC) encounters diverse and time dependent traffic loads. This trend motivates the introduction of NoC load-balanced, adaptive routing mechanisms that achieve higher throughput as compared with traditional oblivious routing schemes that are perceived better suited for hardware implementations. However, an efficient adaptive routing scheme should base its decisions on the global state of the system rather than on local or regional congestion signals as is common in current adaptive routing schemes. 
In this paper we introduce a novel paradigm of NoC centralized adaptive routing, and a specific design for mesh topology. Our scheme continuously monitors the global traffic load in the network and modifies the routing of packets to improve load balancing accordingly. In our specific mesh-based design, XY or YX routes are adaptively selected for each source-destination pair. We show that while our implementation is scalable and lightweight in hardware costs, it outperforms distributed adaptive routing schemes in terms of load balancing and throughput.", acknowledgement = ack-nhfb, affiliation = "Manevich, R (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Manevich, Ran; Cidon, Israel; Kolodny, Avinoam; Walter, Isask'har, Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ranman@tx.technion.ac.il cidon@ee.technion.ac.il kolodny@ee.technion.ac.il zigi@tx.technion.ac.il", da = "2019-06-20", doc-delivery-number = "731BX", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "adaptive routing; Adaptive systems; centralized adaptive routing; Computer architecture; distributed adaptive routing; global state; load balanced adaptive routing; load balancing; Load control; Load management; mesh based design; mesh topology; network on chip; Network on Chip; network routing; Network-on-Chip; network-on-chip; NoC; packet routing; programmable unit; regional congestion signal; routing algorithms; Routing protocols; Telecommunication traffic; Throughput; time dependent traffic load", number-of-cited-references = "7", research-areas = "Computer Science", times-cited = "9", unique-id = "Manevich:2010:CAR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhang:2010:FCA, author = "Meng Zhang and Alvin R. 
Lebeck and Daniel J. Sorin", title = "Fractal Consistency: Architecting the Memory System to Facilitate Verification", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "61--64", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.18", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "One of the most challenging problems in developing a multicore processor is verifying that the design is correct, and one of the most difficult aspects of pre-silicon verification is verifying that the memory system obeys the architecture's specified memory consistency model. To simplify the process of pre-silicon design verification, we propose a system model called the Fractally Consistent Model (FCM). We prove that systems that adhere to the FCM can be verified to obey the memory consistency model in three simple, scalable steps. The procedure for verifying FCM systems contrasts sharply with the difficult, non-scalable procedure required to verify non-FCM systems. We show that FCM systems do not necessarily sacrifice performance, compared to non-FCM systems, despite being simpler to verify.", acknowledgement = ack-nhfb, affiliation = "Zhang, M (Reprint Author), Duke Univ, Dept Elect \& Comp Engn, Durham, NC 27706 USA. Zhang, Meng; Sorin, Daniel J., Duke Univ, Dept Elect \& Comp Engn, Durham, NC 27706 USA. Lebeck, Alvin R., Duke Univ, Dept Comp Sci, Durham, NC 27706 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "731BX", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CCF-0702434, CCF-0811290]", funding-text = "This material is based upon work supported by the National Science Foundation under grants CCF-0702434 and CCF-0811290.", journal-iso = "IEEE Comput. 
Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arithmetic and Logic Structures; Coherence; Computational modeling; Computer architecture; Computer Reliability; Fault-Tolerance; FCM systems; Formal verification; fractal consistent model; Fractals; Hardware; Memory; memory architecture; Memory Consistency; memory consistency model; Memory hierarchy; memory system architecture; Micro-architecture implementation considerations; microprocessor chips; Multicore; multicore processor; multiprocessing systems; Performance Analysis and Design Aids; presilicon verification; Processor Architectures; Protocols; Testing; Validation; Verification", number-of-cited-references = "10", oa = "Green Published", research-areas = "Computer Science", times-cited = "1", unique-id = "Zhang:2010:FCA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2010:AIT, author = "Anonymous", title = "Advertisement --- {{\booktitle{IEEE Transactions on Computers}}} Celebrates 60 Years", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "65--65", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.29", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ICSb, author = "Anonymous", title = "2011 {IEEE Computer Society} Simulator Design Competition", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "66--66", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.19", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ASS, author = "Anonymous", title = "Advertisement --- Special Student Offer", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "67--67", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.30", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ADY, author = "Anonymous", title = "Advertisement --- Distinguish Yourself With the {CSDP}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "68--68", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.26", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:CPS, author = "Anonymous", title = "{Conference Proceedings Services (CPS)} [advertisement]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "69--69", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.21", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ICSc, author = "Anonymous", title = "{IEEE Computer Society} Jobs", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "70--70", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.28", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ASC, author = "Anonymous", title = "Advertisement --- Stay Connected to the {IEEE Computer Society}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "71--71", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.31", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ACS, author = "Anonymous", title = "Advertisement --- {Computer Society Digital Library}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "72--72", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.20", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:EBCb, author = "Anonymous", title = "Editorial Board [Cover2]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "c2--c2", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.23", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:FCb, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "c1--c1", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.22", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:IAb, author = "Anonymous", title = "Information for authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "c3--c3", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.24", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2010:ICSd, author = "Anonymous", title = "{IEEE Computer Society} [Cover4]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "2", pages = "c4--c4", month = jul # "\slash " # dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.25", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Skadron:2011:ELE, author = "K. Skadron", title = "Editorial: Letter from the {Editor-in-Chief}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "1--3", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Skadron:2011:U, author = "Kevin Skadron", title = "Untitled", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "1--3", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "773ZN", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", number-of-cited-references = "0", research-areas = "Computer Science", times-cited = "0", unique-id = "Skadron:2011:U", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Vandierendonck:2011:FMM, author = "Hans Vandierendonck and Andre Seznec", title = "Fairness Metrics for Multi-Threaded Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "4--7", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multi-threaded processors execute multiple threads concurrently in order to increase overall throughput. It is well documented that multi-threading affects per-thread performance but, more importantly, some threads are affected more than others. This is especially troublesome for multi-programmed workloads. Fairness metrics measure whether all threads are affected equally. However defining equal treatment is not straightforward. Several fairness metrics for multi-threaded processors have been utilized in the literature, although there does not seem to be a consensus on what metric does the best job of measuring fairness. This paper reviews the prevalent fairness metrics and analyzes their main properties. Each metric strikes a different trade-off between fairness in the strict sense and throughput. We categorize the metrics with respect to this property. 
Based on experimental data for SMT processors, we suggest using the minimum fairness metric in order to balance fairness and throughput.", acknowledgement = ack-nhfb, affiliation = "Vandierendonck, H (Reprint Author), Univ Ghent, Dept Elect \& Informat Syst, Ghent, Belgium. Vandierendonck, Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent, Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "hans.vandierendonck@elis.ugent.be Andre.Seznec@inria.fr", da = "2019-06-20", doc-delivery-number = "773ZN", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Correlation; fairness; fairness metrics; Harmonic analysis; Instruction sets; measurement; Measurement; multi-programming; Multi-threaded processors; multi-threading; multiprocessing systems; multiprogrammed workloads; multithreaded processors; Parallel Architectures; Performance of Systems; quality-of-service; resource allocation; SMT processors; software metrics; System-on-a-chip; Throughput", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "13", unique-id = "Vandierendonck:2011:FMM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tang:2011:PEM, author = "Jie Tang and Shaoshan Liu and Zhimin Gu and Chen Liu and Jean-Luc Gaudiot", title = "Prefetching in Embedded Mobile Systems Can Be Energy-Efficient", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "8--11", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Data prefetching has been a successful technique in 
high-performance computing platforms. However, the conventional wisdom is that they significantly increase energy consumption, and thus not suitable for embedded mobile systems. On the other hand, as modern mobile applications pose an increasing demand for high performance, it becomes essential to implement high-performance techniques, such as prefetching, in these systems. In this paper, we study the impact of prefetching on the performance and energy consumption of embedded mobile systems. Contrary to the conventional wisdom, our findings demonstrate that as technology advances, prefetching can be energy-efficient while improving performance. Furthermore, we have developed a simple but effective analytical model to help system designers to identify the conditions for energy efficiency.", acknowledgement = ack-nhfb, affiliation = "Tang, J (Reprint Author), Beijing Inst Technol, Beijing 100081, Peoples R China. Tang, Jie; Gu, Zhimin, Beijing Inst Technol, Beijing 100081, Peoples R China. Liu, Shaoshan, Microsoft Corp, Redmond, WA 98052 USA. Liu, Chen, Florida Int Univ, Miami, FL 33199 USA. Gaudiot, Jean-Luc, Univ Calif Irvine, Irvine, CA USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tangjie.bit@gmail.com shaoliu@microsoft.com zmgu@x263.net chen.liu@fiu.edu gaudiot@uci.edu", da = "2019-06-20", doc-delivery-number = "773ZN", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "data prefetching; embedded mobile systems; embedded systems; energy consumption; energy efficiency condition; energy-efficient prefetching; high-performance computing platform; Low power electronics; Low-power design; Memory management; Memory Structures; mobile computing; Mobile computing; Mobile Computing; storage management", number-of-cited-references = "11", ORCID-numbers = "Liu, Chen/0000-0003-1558-6836", research-areas = "Computer Science", times-cited = "19", unique-id = "Tang:2011:PEM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Khan:2011:DDC, author = "Omer Khan and Mieszko Lis and Yildiz Sinangil and Srinivas Devadas", title = "{DCC}: a Dependable Cache Coherence Multicore Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "12--15", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cache coherence lies at the core of functionally-correct operation of shared memory multicores. Traditional directory-based hardware coherence protocols scale to large core counts, but they incorporate complex logic and directories to track coherence states. Technology scaling has reached miniaturization levels where manufacturing imperfections, device unreliability and occurrence of hard errors pose a serious dependability challenge. Broken or degraded functionality of the coherence protocol can lead to a non-operational processor or user visible performance loss. 
In this paper, we propose a dependable cache coherence architecture (DCC) that combines the traditional directory protocol with a novel execution-migration-based architecture to ensure dependability that is transparent to the programmer. Our architecturally redundant execution migration architecture only permits one copy of data to be cached anywhere in the processor: when a thread accesses an address not locally cached on the core it is executing on, it migrates to the appropriate core and continues execution there. Both coherence mechanisms can co-exist in the DCC architecture and we present architectural extensions to seamlessly transition between the directory and execution migration protocols.", acknowledgement = ack-nhfb, affiliation = "Khan, O (Reprint Author), MIT, 77 Massachusetts Ave, Cambridge, MA 02139 USA. Khan, Omer; Lis, Mieszko; Sinangil, Yildiz; Devadas, Srinivas, MIT, Cambridge, MA 02139 USA. Khan, Omer, Univ Massachusetts, Lowell, MA USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "773ZN", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architecturally redundant execution migration architecture; B.3.4 Reliability, Testing, and Fault-Tolerance; B.8 Performance and Reliability; broken functionality; C.4.b Fault tolerance; cache coherence; cache storage; Coherence; coherence mechanisms; coherence states; DCC architecture; degraded functionality; dependability challenge; Dependable architecture; dependable cache coherence architecture; dependable cache coherence multicore architecture; device unreliability; directory protocol; directory-based hardware coherence protocols; execution-migration-based architecture; functionally-correct operation; Hardware; incorporate complex logic; Instruction sets; large core counts; manufacturing imperfections; memory architecture; memory protocols; microprocessor chips; miniaturization levels; Multicore processing; multicores; nonoperational processor; Protocols; shared memory multicores; shared memory systems; System-on-a-chip; technology scaling; user visible performance loss", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "3", unique-id = "Khan:2011:DDC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Rosenfeld:2011:DCA, author = "Paul Rosenfeld and Elliott Cooper-Balis and Bruce Jacob", title = "{DRAMSim2}: a Cycle Accurate Memory System Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "16--19", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper we present DRAMSim2, a cycle accurate memory system simulator. 
The goal of DRAMSim2 is to be an accurate and publicly available DDR2/3 memory system model which can be used in both full system and trace-based simulations. We describe the process of validating DRAMSim2 timing against manufacturer Verilog models in an effort to prove the accuracy of simulation results. We outline the combination of DRAMSim2 with a cycle-accurate x86 simulator that can be used to perform full system simulations. Finally, we discuss DRAMVis, a visualization tool that can be used to graph and compare the results of DRAMSim2 simulations.", acknowledgement = ack-nhfb, affiliation = "Rosenfeld, P (Reprint Author), Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD 20742 USA. Rosenfeld, Paul; Cooper-Balis, Elliott; Jacob, Bruce, Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD 20742 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "prosenf1@umd.edu ecc17@umd.edu blj@umd.edu", da = "2019-06-20", doc-delivery-number = "773ZN", eissn = "1556-6064", esi-highly-cited-paper = "Y", esi-hot-paper = "N", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computational modeling; cycle accurate memory system simulator; DDR2/3 memory system model; DRAM; DRAM chips; DRAMSim2 simulation; DRAMSim2 timing; Driver circuits; Hardware design languages; Load modeling; memory architecture; memory cards; Object oriented modeling; Primary memory; Random access memory; Simulation; Timing; trace-based simulation; Verilog model; visualization tool", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "270", unique-id = "Rosenfeld:2011:DCA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gou:2011:ESH, author = "Chunyang Gou and Georgi N. 
Gaydadjiev", title = "Exploiting {SPMD} Horizontal Locality", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "20--23", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper, we analyze a particular spatial locality case (called horizontal locality) inherent to manycore accelerator architectures employing barrel execution of SPMD kernels, such as GPUs. We then propose an adaptive memory access granularity framework to exploit and enforce the horizontal locality in order to reduce the interferences among accelerator cores memory accesses and hence improve DRAM efficiency. With the proposed technique, DRAM efficiency grows by 1.42X on average, resulting in 12.3\% overall performance gain, for a set of representative memory intensive GPGPU applications.", acknowledgement = ack-nhfb, affiliation = "Gou, C (Reprint Author), Delft Univ Technol, NL-2600 AA Delft, Netherlands. Gou, Chunyang; Gaydadjiev, Georgi N., Delft Univ Technol, NL-2600 AA Delft, Netherlands.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "c.gou@tudelft.nl g.n.gaydadjiev@tudelft.nl", da = "2019-06-20", doc-delivery-number = "773ZN", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator core memory access; adaptive memory access granularity; Bandwidth; barrel execution; DRAM chips; DRAM efficiency; GPU; Graphics processing unit; Instruction sets; interference; Kernel; manycore accelerator architecture; Memory hierarchy; microprocessor chips; Multi-core/single-chip multiprocessors; parallel architectures; Pipelines; Proposals; Random access memory; SIMD processors; single program multiple data; spatial locality; SPMD horizontal locality; SPMD kernel", number-of-cited-references = "13", ORCID-numbers = "Gaydadjiev, Georgi/0000-0002-3678-7007", research-areas = "Computer Science", researcherid-numbers = "Gaydadjiev, Georgi/F-1488-2010", times-cited = "1", unique-id = "Gou:2011:ESH", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2011:GGC, author = "Xiaoqun Wang and Zhenzhou Ji and Chen Fu and Mingzeng Hu", title = "{GCMS}: a Global Contention Management Scheme in Hardware Transactional Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "24--27", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware Transactional Memory (HTM) is a promising Transactional Memory (TM) implementation because of its strong atomicity and high performance. Unfortunately, most contention management approaches in HTMs are dedicated to specific transaction conflict scenarios and it is hard to choose a universal strategy for different workloads. In addition, HTM performance degrades sharply when there are severe transaction conflicts. 
In this paper, we present a Global Contention Management Scheme (GCMS) to resolve severe transaction conflicts in HTMs. Our scheme depends on a Deadlock and Livelock Detection Mechanism (DLDM) and a Global Contention Manager (GCM) to resolve severe transaction conflicts. This scheme is orthogonal to the rest of the contention management policies. We have incorporated GCMS into different HTMs and compared the performance of the enhanced systems with that of the original HTMs with the STAMP benchmark suite. The results demonstrate that the performance of the enhanced HTMs is improved.", acknowledgement = ack-nhfb, affiliation = "Wang, XQ (Reprint Author), Harbin Inst Technol, Sch Comp Sci, Harbin 150006, Peoples R China. Wang, Xiaoqun; Ji, Zhenzhou; Fu, Chen; Hu, Mingzeng, Harbin Inst Technol, Sch Comp Sci, Harbin 150006, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "wxiaoqun@gmail.com", da = "2019-06-20", doc-delivery-number = "773ZN", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bioinformatics; Concurrent Programming; Contention Management; deadlock-and-livelock detection mechanism; GCMS scheme; Genomics; global contention management scheme; global contention manager; Hardware; Hardware Transactional Memory; hardware transactional memory; Multi-core/single-chip multiprocessors; Multicore Processors; Parallel Programming; Program processors; Radiation detectors; storage management; System recovery; transaction conflict; transaction processing", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "1", unique-id = "Wang:2011:GGC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2011:RL, author = "Anonymous", title = "2010 Reviewers List", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "28--28", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "IEEE publishing", } @Article{Anonymous:2011:AI, author = "Anonymous", title = "2010 Annual Index", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "??--??", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2011:Ca, author = "Anonymous", title = "Cover 2", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "c2--c2", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2011:Cb, author = "Anonymous", title = "Cover 3", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "c3--c3", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2011:Cc, author = "Anonymous", title = "Cover 4", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "c4--c4", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2011:FCa, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "c1--c1", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Mars:2011:HHW, author = "Jason Mars and Lingjia Tang and Robert Hundt", title = "Heterogeneity in {``Homogeneous''} Warehouse-Scale Computers: a Performance Opportunity", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "29--32", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.14", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The class of modern datacenters recently coined as ``warehouse scale computers'' (WSCs) has traditionally been embraced as homogeneous computing platforms. However, due to frequent machine replacements and upgrades, modern WSCs are in fact composed of diverse commodity microarchitectures and machine configurations. Yet, current WSCs are designed with an assumption of homogeneity, leaving a potentially significant performance opportunity unexplored. 
In this paper, we investigate the key factors impacting the available heterogeneity in modern WSCs, and the benefit of exploiting this heterogeneity to maximize overall performance. We also introduce a new metric, opportunity factor, which can be used to quantify an application's sensitivity to the heterogeneity in a given WSC. For applications that are sensitive to heterogeneity, we observe a performance improvement of up to 70\% when employing our approach. In a WSC composed of state-of-the-art machines, we can improve the overall performance of the entire datacenter by 16\% over the status quo.", acknowledgement = ack-nhfb, affiliation = "Mars, J (Reprint Author), Univ Virginia, Charlottesville, VA 22903 USA. Mars, Jason; Tang, Lingjia, Univ Virginia, Charlottesville, VA 22903 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jom5x@cs.virginia.edu lt8f@cs.virginia.edu rhundt@google.com", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Computer architecture; computer centres; datacenters; Design studies; Distributed architectures; diverse commodity microarchitectures; Heterogeneous (hybrid) systems; homogeneous warehouse-scale computers; integration and modeling; machine configurations; mainframes; Microarchitecture; Optimization; Scheduling and task partitioning; Super (very large) computers; System architectures", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "22", unique-id = "Mars:2011:HHW", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Michelogiannakis:2011:PCE, author = "George Michelogiannakis and Nan Jiang and Daniel U. Becker and William J. 
Dally", title = "Packet Chaining: Efficient Single-Cycle Allocation for On-Chip Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "33--36", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper introduces packet chaining, a simple and effective method to increase allocator matching efficiency and hence network performance, particularly suited to networks with short packets and short cycle times. Packet chaining operates by chaining packets destined to the same output together, to reuse the switch connection of a departing packet. This allows an allocator to build up an efficient matching over a number of cycles, like incremental allocation, but not limited by packet length. For a 64-node 2D mesh at maximum injection rate and with single-flit packets, packet chaining increases network throughput by 15\% compared to a conventional single-iteration separable iSLIP allocator, outperforms a wavefront allocator, and gives comparable throughput with an augmenting paths allocator. Packet chaining achieves this performance with a cycle time comparable to a single-iteration separable allocator. Packet chaining also reduces average network latency by 22.5\% compared to iSLIP. Finally, packet chaining increases IPC up to 46\% (16\% average) for application benchmarks because short packets are critical in a typical cache-coherent CMP. These are considerable improvements given the maturity of network-on-chip routers and allocators.", acknowledgement = ack-nhfb, affiliation = "Michelogiannakis, G (Reprint Author), Stanford Univ, Stanford, CA 94305 USA. 
Michelogiannakis, George; Jiang, Nan; Becker, Daniel U.; Dally, William J., Stanford Univ, Stanford, CA 94305 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mihelog@stanford.edu njiang37@stanford.edu dub@stanford.edu dally@stanford.edu", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CCF-0702341]; National Security Agency [H98230-08-C-0272-P007]; Robert Bosch Fellowship; Prof. Michael Farmwald Fellowship; Prof. Michael J. Flynn Stanford Graduate Fellowship", funding-text = "This work was supported in part by the National Science Foundation under Grant CCF-0702341, in part by the National Security Agency under Contract H98230-08-C-0272-P007 and in part by the Robert Bosch, Prof. Michael Farmwald and Prof. Michael J. Flynn Stanford Graduate Fellowships.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "allocator matching efficiency; Benchmark testing; Interconnection architectures; network performance; network-on-chip; network-on-chip routers; On-chip interconnection networks; on-chip networks; packet chaining; Resource management; single-iteration separable iSLIP allocator; System-on-a-chip; Throughput", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "1", unique-id = "Michelogiannakis:2011:PCE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Ho:2011:EIB, author = "Chen-Han Ho and Garret Staus and Aaron Ulmer and Karthikeyan Sankaralingam", title = "Exploring the Interaction Between Device Lifetime Reliability and Security Vulnerabilities", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "37--40", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.16", ISSN = "1556-6056 (print), 1556-6064 (electronic)", 
ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As technology scales, device reliability is becoming a fundamental problem. Even though manufacture test can guarantee product quality, due to various types of wearout and failure modes, permanent faults appearing in the field is becoming an increasingly important and real problem. Such types of wear-out creates permanent faults in devices after release to the user during their lifetime. In this paper, we perform a formal investigation of the impact of permanent faults on security, examine empirical evidence, and demonstrate a real attack. Our results show that permanent stuck-at faults may leave security holes in microprocessors. We show that an adversary with knowledge of a fault can launch attacks which can obtain critical secrets such as a private key in 30 seconds.", acknowledgement = ack-nhfb, affiliation = "Ho, CH (Reprint Author), Univ Wisconsin, Madison, WI 53706 USA. Ho, Chen-Han; Staus, Garret; Ulmer, Aaron; Sankaralingam, Karthikeyan, Univ Wisconsin, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arithmetic and Logic Structures; Circuit faults; Computer bugs; Control Structures and Microprogramming; Cryptography; device lifetime reliability; failure mode; fault tolerant computing; Hardware reliability; Logic programming; microprocessor chips; microprocessors; Permanent Fault; permanent fault; private key; product quality; Program processors; public key cryptography; Reliability; Reliability engineering; Security; security vulnerability; wear-out type; wearout mode", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "0", unique-id = "Ho:2011:EIB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hernandez:2011:FTV, author = "Carles Hernandez and Antoni Roca and Jose Flich and Federico Silla and Jose Duato", title = "Fault-Tolerant Vertical Link Design for Effective {3D} Stacking", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "41--44", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.17", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Recently, 3D stacking has been proposed to alleviate the memory bandwidth limitation arising in chip multiprocessors (CMPs). As the number of integrated cores in the chip increases the access to external memory becomes the bottleneck, thus demanding larger memory amounts inside the chip. The most accepted solution to implement vertical links between stacked dies is by using Through Silicon Vias (TSVs). However, TSVs are exposed to misalignment and random defects compromising the yield of the manufactured 3D chip. A common solution to this problem is by over-provisioning, thus impacting on area and cost. 
In this paper, we propose a fault-tolerant vertical link design. With its adoption, fault-tolerant vertical links can be implemented in a 3D chip design at low cost without the need of adding redundant TSVs (no over-provision). Preliminary results are very promising as the fault-tolerant vertical link design increases switch area only by 6.69\% while the achieved interconnect yield tends to 100\%.", acknowledgement = ack-nhfb, affiliation = "Hernandez, C (Reprint Author), Univ Politecn Valencia, C Cami de Vera S-N, Valencia 46022, Spain. Hernandez, Carles; Roca, Antoni; Flich, Jose; Silla, Federico; Duato, Jose, Univ Politecn Valencia, Valencia 46022, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "carherlu@gap.upv.es", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish MEC; MICINN; European Commission [CSD2006-00046, TIN2009-14475-C04]; NaNoC [248972]", funding-text = "This work was supported by the Spanish MEC and MICINN, as well as European Commission FEDER funds, under Grants CSD2006-00046 and TIN2009-14475-C04. It was also partly supported by the project NaNoC (project label 248972) which is funded by the European Commission within the Research Programme FP7.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3D chip; 3D Stacking; 3D stacking; chip multiprocessors; CMP; effective 3D stacking; external memory; Fault Tolerance; fault tolerance; Fault tolerant systems; fault-tolerant vertical link design; memory bandwidth limitation; Memory management; microprocessor chips; network-on-chip; NoC; Stacking; storage management chips; Three dimensional displays; three-dimensional integrated circuits; through silicon vias; TSV", number-of-cited-references = "20", oa = "Green Published", ORCID-numbers = "Silla, Federico/0000-0002-6435-1200 Hernandez, Carles/0000-0001-5393-3195", research-areas = "Computer Science", times-cited = "1", unique-id = "Hernandez:2011:FTV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Choi:2011:EID, author = "Inseok Choi and Minshu Zhao and Xu Yang and Donald Yeung", title = "Experience with Improving Distributed Shared Cache Performance on {Tilera}'s {Tile} Processor", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "45--48", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.18", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper describes our experience with profiling and optimizing physical locality for the distributed shared cache (DSC) in Tilera's Tile multicore processor. Our approach uses the Tile Processor's hardware performance measurement counters (PMCs) to acquire page-level access pattern profiles. A key problem we address is imprecise PMC interrupts. 
Our profiling tools use binary analysis to correct for interrupt ``skid'', thus pinpointing individual memory operations that incur remote DSC slice references and permitting us to sample their access patterns. We use our access pattern profiles to drive page homing optimizations for both heap and static data objects. Our experiments show we can improve physical locality for 5 out of 11 SPLASH2 benchmarks running on 32 cores, enabling 32.9\%-77.9\% of DSC references to target the local DSC slice. To our knowledge, this is the first work to demonstrate page homing optimizations on a real system.", acknowledgement = ack-nhfb, affiliation = "Choi, I (Reprint Author), Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD 20742 USA. Choi, Inseok; Zhao, Minshu; Yang, Xu; Yeung, Donald, Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD 20742 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "inseok@umd.edu mszhao@umd.edu yangxu@umd.edu yeung@umd.edu", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; binary analysis; cache storage; Computer architecture; Data streams; Design methodology; Design studies; distributed shared cache performance; hardware performance measurement counters; microprocessor chips; Multi-core/single-chip multiprocessors; Multicore processing; Multiple Data Stream Architectures (Multiprocessors); multiprocessing systems; Multiprocessing systems; page homing optimization; page-level access pattern profile; PMC interrupt; profiling tool; Tilera tile multicore processor", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "5", unique-id = "Choi:2011:EID", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Prieto:2011:MCM, author = "Pablo Prieto and Valentin Puente and Jose-Angel Gregorio", title = "Multilevel Cache Modeling for Chip-Multiprocessor Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "49--52", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.20", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper presents a simple analytical model for predicting on-chip cache hierarchy effectiveness in chip multiprocessors (CMP) for a state-of-the-art architecture. Given the complexity of this type of systems, we use rough approximations, such as the empirical observation that the re-reference timing pattern follows a power law and the assumption of a simplistic delay model for the cache, in order to provide a useful model for the memory hierarchy responsiveness. 
This model enables the analytical determination of average access time, which makes design space pruning useful before sweeping the vast design space of this class of systems. The model is also useful for predicting cache hierarchy behavior in future systems. The fidelity of the model has been validated using a state-of-the-art, full-system simulation environment, on a system with up to sixteen out-of-order processors with cache-coherent caches and using a broad spectrum of applications, including complex multithread workloads. This simple model can predict a near-to-optimal, on-chip cache distribution while also estimating how future systems running future applications might behave.", acknowledgement = ack-nhfb, affiliation = "Prieto, P (Reprint Author), Univ Cantabria, Cantabria, Spain. Prieto, Pablo; Puente, Valentin; Gregorio, Jose-Angel, Univ Cantabria, Cantabria, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "prietop@unican.es vpuente@unican.es monaster@unican.es", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish Ministry of Science and Innovation [TIN2010-18159]; HiPEAC2 European Network of Excellence", funding-text = "This work has been supported by the Spanish Ministry of Science and Innovation, under contracts TIN2010-18159, and by the HiPEAC2 European Network of Excellence. The authors would like to thank the reviewers for their valuable comments.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "approximation theory; cache hierarchy behavior prediction; cache storage; Cache storage; cache-coherent caches; chip-multiprocessor systems; complex multithread workloads; Complexity theory; Computational modeling; design space; integrated circuit design; Memory hierarchy; memory hierarchy responsiveness; microprocessor chips; Multi-core/single-chip multiprocessors; multilevel cache modeling; multiprocessing systems; Multiprocessing systems; near-to-optimal on-chip cache distribution; on-chip cache hierarchy effectiveness prediction; power law; re-reference timing pattern; rough approximations; simplistic delay model assumption; Software tools; Thermal analysis; Thermal sensors", number-of-cited-references = "13", ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente, Valentin/0000-0002-6904-3282 Gregorio, Jose Angel/0000-0003-2214-303X", research-areas = "Computer Science", times-cited = "3", unique-id = "Prieto:2011:MCM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Siozios:2011:SRT, author = "Kostas Siozios and Dimitrios Rodopoulos and Dimitrios Soudris", title = "On Supporting Rapid Thermal Analysis", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "53--56", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.19", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Detailed thermal analysis is usually performed exclusively at design time since it is a computationally intensive task. In this paper, we introduce a novel methodology for fast, yet accurate, thermal analysis. 
The introduced methodology is software supported by a new open source tool that enables hierarchical thermal analysis with adaptive levels of granularity. Experimental results prove the efficiency of our approach since it leads to average reduction of the execution overhead up to 70\% with a penalty in accuracy ranging between 2\% and 8\%.", acknowledgement = ack-nhfb, affiliation = "Siozios, K (Reprint Author), Natl Tech Univ Athens, Sch ECE, GR-10682 Athens, Greece. Siozios, Kostas; Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech Univ Athens, Sch ECE, GR-10682 Athens, Greece.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Complexity theory; Computational modeling; Computer Systems Organization; Design Methodologies; General; Hardware; hierarchical thermal analysis; Modeling techniques; Monitoring; open source tool; Performance of Systems; Power Management; public domain software; rapid thermal analysis; Reconfigurable Hardware; Reconfigurable hardware; Reliability; software engineering; software supported; Software tools; thermal analysis; Thermal analysis; Thermal Monitoring; Thermal sensors", number-of-cited-references = "8", ORCID-numbers = "Siozios, Kostas/0000-0002-0285-2202 Soudris, Dimitrios/0000-0002-6930-6847", research-areas = "Computer Science", researcherid-numbers = "Soudris, Dimitrios/I-5252-2014 Siozios, Kostas/F-9726-2011 Soudris, Dimitrios/O-8843-2019", times-cited = "3", unique-id = "Siozios:2011:SRT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2011:Cd, author = "Anonymous", title = "Cover 3", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "c3--c3", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = 
"https://doi.org/10.1109/L-CA.2011.30", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2011:FCb, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "c1--c1", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.28", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2011:ICS, author = "Anonymous", title = "{IEEE Computer Society} [society information]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "c4--c4", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.31", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2011:PI, author = "Anonymous", title = "Publication information", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "c2--c2", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.29", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Sethumadhavan:2012:CHD, author = "Simha Sethumadhavan and Ryan Roberts and Yannis Tsividis", title = "A Case for Hybrid Discrete-Continuous Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.22", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Current technology trends indicate that power- and energy-efficiency will limit chip throughput in the future. Current solutions to these problems, either in the way of programmable or fixed-function digital accelerators will soon reach their limits as microarchitectural overheads are successively trimmed. A significant departure from current computing methods is required to carry forward computing advances beyond digital accelerators. 
In this paper we describe how the energy-efficiency of a large class of problems can be improved by employing a hybrid of the discrete and continuous models of computation instead of the ubiquitous, traditional discrete model of computation. We present preliminary analysis of domains and benchmarks that can be accelerated with the new model. Analysis shows that machine learning, physics and up to one-third of SPEC, RMS and Berkeley suite of applications can be accelerated with the new hybrid model.", acknowledgement = ack-nhfb, affiliation = "Sethumadhavan, S (Reprint Author), Columbia Univ, New York, NY 10027 USA. Sethumadhavan, Simha; Roberts, Ryan; Tsividis, Yannis, Columbia Univ, New York, NY 10027 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "simha@cs.columbia.edu", da = "2019-06-20", doc-delivery-number = "953VM", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "DARPA; AFRL [FA8750-10-2-0253, FA9950-09-1-0389]; NSF", funding-text = "Sethumadhavan's research is funded by grants from DARPA, AFRL (FA8750-10-2-0253, FA9950-09-1-0389), the NSF CAREER program, gifts from Microsoft Research and Columbia University, and software donations from Synopsys and Wind River. Roberts conducted this research as a GRA in Sethumadhavan's Lab.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Approximation algorithms; Benchmark testing; Berkeley suite; Computational modeling; Computer architecture; computer architecture; Computer architecture; computer architecture; computing methods; continuous models; cryptography; Design studies; Differential equations; discrete model; discrete models; domains analysis; energy conservation; energy-efficiency; fixed-function digital accelerators; forward computing advances; hybrid discrete-continuous architectures; Hybrid systems; machine learning; Mathematical model; microarchitectural overheads; microprocessor chips; power-efficiency; Processor architectures; RMS; SPEC; Very large scale integration", number-of-cited-references = "16", research-areas = "Computer Science", times-cited = "4", unique-id = "Sethumadhavan:2012:CHD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kong:2012:ASF, author = "Ji Kong and Peilin Liu and Yu Zhang", title = "Atomic Streaming: a Framework of On-Chip Data Supply System for Task-Parallel {MPSoCs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.21", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "State of the art fabrication technology for integrating numerous hardware resources such as Processors/DSPs and memory arrays into a single chip enables the emergence of Multiprocessor System-on-Chip (MPSoC). Stream programming paradigm based on MPSoC is highly efficient for single functionality scenario due to its dedicated and predictable data supply system. 
However, when memory traffic is heavily shared among parallel tasks in applications with multiple interrelated functionalities, performance suffers through task interferences and shared memory congestions which lead to poor parallel speedups and memory bandwidth utilizations. This paper proposes a framework of stream processing based on-chip data supply system for task-parallel MPSoCs. In this framework, stream address generations and data computations are decoupled and parallelized to allow full utilization of on-chip resources. Task granularities are dynamically tuned to jointly optimize the overall application performance. Experiments show that proposed framework as well as the tuning scheme are effective for joint optimization in task-parallel MPSoCs.", acknowledgement = ack-nhfb, affiliation = "Kong, J (Reprint Author), Shanghai Jiao Tong Univ, Sch Elect Informat \& Elect Engn, Shanghai 200030, Peoples R China. Kong, Ji; Liu, Peilin, Shanghai Jiao Tong Univ, Sch Elect Informat \& Elect Engn, Shanghai 200030, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "johnhophen@sjtu.edu.cn liupeilin@sjtu.edu.cn zhyu@cn.ibm.com", da = "2019-06-20", doc-delivery-number = "953VM", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "IBM Research-China under the IBM", funding-text = "This work has been partially supported by IBM Research-China under the IBM Ph.D. Fellowship program for the 2010-2011 academic year.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application studies resulting in better multiple-processor systems; atomic streaming; Bandwidth; data computations; Memory hierarchy; Multi-core/single-chip multiprocessors; Multicore processing; Multiple Data Stream Architectures (Multiprocessors); Multiprocessing systems; multiprocessor system-on-chip; on-chip data supply system; Prefetching; shared memory congestions; shared memory systems; stream address generations; stream programming paradigm; Streaming media; System-on-a-chip; system-on-chip; task interferences; task-parallel MPSoC; Throughput", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Kong:2012:ASF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Deb:2012:HSC, author = "Abhishek Deb and Josep Maria Codina and Antonio Gonzalez", title = "A {HW\slash SW} Co-designed Programmable Functional Unit", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.23", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "In this paper, we propose a novel programmable functional unit (PFU) to accelerate general purpose application execution on a modern out-of-order x86 processor. Code is transformed and instructions are generated that run on the PFU using a co-designed virtual machine (Cd-VM). 
Results presented in this paper show that this HW/SW co-designed approach produces average speedups in performance of 29\% in SPECFP and 19\% in SPECINT, and up to 55\%, over modern out-of-order processor.", acknowledgement = ack-nhfb, affiliation = "Deb, A (Reprint Author), Univ Politecn Cataluna, C Jordi Girona 1-3, Barcelona, Spain. Deb, Abhishek; Gonzalez, Antonio, Univ Politecn Cataluna, Barcelona, Spain. Maria Codina, Josep; Gonzalez, Antonio, Intel Res Labs Barcelona, Barcelona, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "abhishek@ac.upc.edu josep.m.codina@intel.com antonio@intel.com", da = "2019-06-20", doc-delivery-number = "953VM", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; hardware-software codesign; Hardware/software interfaces; hw/sw co-designed; Interface states; Load modeling; Micro-architecture implementation considerations; Microarchitecture; Processor Architectures; programmable functional unit; Programmable functional units; Registers; virtual machine", number-of-cited-references = "13", ORCID-numbers = "Gonzalez, Antonio/0000-0002-0009-0996", research-areas = "Computer Science", researcherid-numbers = "Gonzalez, Antonio/I-2961-2014", times-cited = "0", unique-id = "Deb:2012:HSC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Piscitelli:2012:HLP, author = "Roberta Piscitelli and Andy D. 
Pimentel", title = "A High-Level Power Model for {MPSoC} on {FPGA}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.24", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper presents a framework for high-level power estimation of multiprocessor systems-on-chip (MPSoC) architectures on FPGA. The technique is based on abstract execution profiles, called event signatures. As a result, it is capable of achieving good evaluation performance, thereby making the technique highly useful in the context of early system-level design space exploration. We have integrated the power estimation technique in a system-level MPSoC synthesis framework. Using this framework, we have designed a range of different candidate MPSoC architectures and compared our power estimation results to those from real measurements on a Virtex-6 FPGA board.", acknowledgement = ack-nhfb, affiliation = "Piscitelli, R (Reprint Author), Univ Amsterdam, Inst Informat, NL-1012 WX Amsterdam, Netherlands. Piscitelli, Roberta; Pimentel, Andy D., Univ Amsterdam, Inst Informat, NL-1012 WX Amsterdam, Netherlands.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "r.piscitelli@uva.nl a.d.pimentel@uva.nl", da = "2019-06-20", doc-delivery-number = "953VM", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "MADNESS STREP", funding-text = "This work has been partially supported by the MADNESS STREP-FP7 European Project.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "abstract execution profiles; Computational modeling; Computer architecture; Estimation; event signatures; Field programmable gate arrays; field programmable gate arrays; Field programmable gate arrays; Formal models; High-level power estimation; high-level power estimation framework; high-level power model; integrated circuit design; Mathematical model; Microprocessors; MPSoC on FPGA; multiprocessing systems; multiprocessor systems-on-chip architectures; Performance Analysis and Design Aids; performance evaluation; power aware computing; Power demand; power estimation technique; Simulation; system-level design space exploration; system-level MPSoC design space exploration; system-level MPSoC synthesis framework; system-on-chip; Virtex-6 FPGA board", number-of-cited-references = "15", research-areas = "Computer Science", times-cited = "0", unique-id = "Piscitelli:2012:HLP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Finlayson:2012:OSP, author = "Ian Finlayson and Gang-Ryung Uh and David Whalley and Gary Tyson", title = "An Overview of Static Pipelining", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.26", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "A new generation of mobile applications requires reduced energy consumption without sacrificing execution performance. In this paper, we propose to respond to these conflicting demands with an innovative statically pipelined processor supported by an optimizing compiler. 
The central idea of the approach is that the control during each cycle for each portion of the processor is explicitly represented in each instruction. Thus the pipelining is in effect statically determined by the compiler. The benefits of this approach include simpler hardware and that it allows the compiler to perform optimizations that are not possible on traditional architectures. The initial results indicate that static pipelining can significantly reduce power consumption without adversely affecting performance.", acknowledgement = ack-nhfb, affiliation = "Finlayson, I (Reprint Author), Florida State Univ, Dept Comp Sci, Tallahassee, FL 32306 USA. Finlayson, Ian; Whalley, David; Tyson, Gary, Florida State Univ, Dept Comp Sci, Tallahassee, FL 32306 USA. Uh, Gang-Ryung, Boise State Univ, Dept Comp Sci, Boise, ID 83725 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "finlayso@cs.fsu.edu uh@cs.boisestate.edu whalley@cs.fsu.edu tyson@cs.fsu.edu", da = "2019-06-20", doc-delivery-number = "953VM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CNS-0964413, CNS-0915926]", funding-text = "We thank the anonymous reviewers for their constructive comments and suggestions. This research was supported in part by NSF grants CNS-0964413 and CNS-0915926.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Computer architecture; Energy consumption; energy consumption reduction; execution performance; General; mobile applications; optimising compilers; Optimization; optimizing compiler; Pipeline processing; pipeline processing; Pipeline processors; power aware computing; Radio frequency; Registers; statically pipelined processor", number-of-cited-references = "14", oa = "Green Published", research-areas = "Computer Science", times-cited = "6", unique-id = "Finlayson:2012:OSP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wu:2012:CID, author = "Lisa Wu and Martha A. Kim and Stephen A. Edwards", title = "Cache Impacts of Datatype Acceleration", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.25", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware acceleration is a widely accepted solution for performance and energy efficient computation because it removes unnecessary hardware for general computation while delivering exceptional performance via specialized control paths and execution units. The spectrum of accelerators available today ranges from coarse-grain off-load engines such as GPUs to fine-grain instruction set extensions such as SSE. This research explores the benefits and challenges of managing memory at the data-structure level and exposing those operations directly to the ISA. We call these instructions Abstract Datatype Instructions (ADIs). This paper quantifies the performance and energy impact of ADIs on the instruction and data cache hierarchies. 
For instruction fetch, our measurements indicate that ADIs can result in 21-48\% and 16-27\% reductions in instruction fetch time and energy respectively. For data delivery, we observe a 22-40\% reduction in total data read/write time and 9-30\% in total data read/write energy.", acknowledgement = ack-nhfb, affiliation = "Wu, L (Reprint Author), Columbia Univ, Dept Comp Sci, New York, NY 10027 USA. Wu, Lisa; Kim, Martha A.; Edwards, Stephen A., Columbia Univ, Dept Comp Sci, New York, NY 10027 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "lisa@cs.columbia.edu martha@cs.columbia.edu sedwards@cs.columbia.edu", da = "2019-06-20", doc-delivery-number = "953VM", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "abstract data types; abstract datatype instruction; Accelerators; ADI; cache hierarchy; Cache Hierarchy; cache hierarchy; Cache memories; cache storage; coarse grain off-load engine; data read-write energy; data structure level; Data Structures; energy conservation; energy efficient computation; energy impact; execution unit; fine grain instruction set extension; hardware acceleration; Hardware acceleration; hardware acceleration; Hardware/software interfaces; Instruction fetch; instruction fetch energy; instruction fetch time; Instruction Set Extensions; instruction sets; ISA; Memory hierarchy; memory management; Memory Structures; Multicore processing; power aware computing; Program processors; Support vector machines; Vectors", number-of-cited-references = "15", ORCID-numbers = "Edwards, Stephen/0000-0003-2609-4861", research-areas = "Computer Science", times-cited = "0", unique-id = "Wu:2012:CID", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2012:RL, author = "Anonymous", title = "2011 Reviewers List", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = 
"1", pages = "25--26", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Lists the reviewers who contributed to IEEE Computer Architecture Letters in 2011.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "IEEE publishing", } @Article{Anonymous:2012:TNQ, author = "Anonymous", title = "There now is a quick and easy way to find out about our collection of {{\booktitle{Transactions}}} [Advertisement]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "26--26", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.19", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Advertisement: Visit http://www.computer.org/whats-new today!", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2012:ACP, author = "Anonymous", title = "Advertisement --- {Conference Publishing Services (CPS)}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "28--28", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "IEEE Conference Publishing Services (CPS) advertisement.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2012:AI, author = "Anonymous", title = "2011 Annual Index", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "??--??", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This index covers all technical items --- papers, correspondence, reviews, etc. --- that appeared in this periodical during the year, and items from previous years that were commented upon or corrected in this year. Departments and other items may also be covered if they have been judged to have archival value. The Author Index contains the primary entry for each item, listed under the first author's name. The primary entry includes the co-authors' names, the title of the paper or other item, and its location, specified by the publication abbreviation, year, month, and inclusive pagination. 
The Subject Index contains entries describing the item under all appropriate subject headings, plus the first author's name, the publication abbreviation, month, and year, and inclusive pages. Note that the item title is found only under the primary entry in the Author Index.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Indexes", } @Article{Anonymous:2012:Ca, author = "Anonymous", title = "{[Cover2]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "c2--c2", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Provides a listing of current society officers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2012:Cb, author = "Anonymous", title = "{[Cover3]}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "c3--c3", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.16", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Provides a listing of current society officers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2012:FCT, author = "Anonymous", title = "[{Front} cover and table of contents]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "c1--c1", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.14", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Presents the table of contents for this issue of the periodical.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2012:ICS, author = "Anonymous", title = "{IEEE Computer Society} [Back cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "1", pages = "c4--c4", month = jan # "\slash " # jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.17", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Provides a listing of current committee members and society officers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Davis:2012:IVL, author = "John D. Davis and Suzanne Rivoire and Moises Goldszmidt and Ehsan K. 
Ardestani", title = "Including Variability in Large-Scale Cluster Power Models", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "29--32", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.27", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Studying the energy efficiency of large-scale computer systems requires models of the relationship between resource utilization and power consumption. Prior work on power modeling assumes that models built for a single node will scale to larger groups of machines. However, we find that inter-node variability in homogeneous clusters leads to substantially different models for different nodes. Moreover, ignoring this variability will result in significant prediction errors when scaled to the cluster level. We report on inter-node variation for four homogeneous five-node clusters using embedded, laptop, desktop, and server processors. The variation is manifested quantitatively in the prediction error and qualitatively on the resource utilization variables (features) that are deemed relevant for the models. These results demonstrate the need to sample multiple machines in order to produce accurate cluster models.", acknowledgement = ack-nhfb, affiliation = "Rivoire, Suzanne, Sonoma State Univ, Rohnert Pk, CA 94928 USA. Ardestani, Ehsan K., Univ CA, Santa Cruz, CA USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "john.d@microsoft.com suzanne.rivoire@sonoma.edu moises@microsoft.com eka@soe.ucsc.edu", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computational modeling; Data models; evaluation; Measurement; modeling; Power demand; Power Management; Power measurement; Predictive models; Radiation detectors; Servers; simulation of multiple-processor systems", number-of-cited-references = "26", research-areas = "Computer Science", times-cited = "3", unique-id = "Davis:2012:IVL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lakshminarayana:2012:DSP, author = "Nagesh B. Lakshminarayana and Jaekyu Lee and Hyesoon Kim and Jinwoo Shin", title = "{DRAM} Scheduling Policy for {GPGPU} Architectures Based on a Potential Function", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "33--36", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.32", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "GPGPU architectures (applications) have several different characteristics compared to traditional CPU architectures (applications): highly multithreaded architectures and SIMD-execution behavior are the two important characteristics of GPGPU computing. In this paper, we propose a potential function that models the DRAM behavior in GPGPU architectures and a DRAM scheduling policy, alpha-SJF policy to minimize the potential function. The scheduling policy essentially chooses between SJF and FR-FCFS at run-time based on the number of requests from each thread and whether the thread has a row buffer hit.", acknowledgement = ack-nhfb, affiliation = "Lakshminarayana, NB (Reprint Author), Georgia Inst Technol, Sch Comp Sci, Atlanta, GA 30332 USA. 
Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon; Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci, Atlanta, GA 30332 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu", da = "2019-06-20", doc-delivery-number = "057JO", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Computer architecture; DRAM chips; DRAM scheduling; DRAM scheduling policy; dynamic random access memory; Equations; general-purpose graphics processing unit; GPGPU; GPGPU architecture; graphics processing units; Instruction sets; Mathematical model; multi-threading; multithreaded architecture; Potential function; potential function; Potential function; Processor scheduling; Random access memory; row buffer hit; scheduling; SIMD-execution behavior", number-of-cited-references = "5", research-areas = "Computer Science", researcherid-numbers = "Shin, Jinwoo/M-5389-2013", times-cited = "7", unique-id = "Lakshminarayana:2012:DSP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2012:ISA, author = "Yaohua Wang and Shuming Chen and Kai Zhang and Jianghua Wan and Xiaowen Chen and Hu Chen and Haibo Wang", title = "Instruction Shuffle: Achieving {MIMD}-like Performance on {SIMD} Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "37--40", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.34", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "SIMD architectures are less efficient for applications with the diverse control-flow behavior, which can be mainly attributed 
to the requirement of the identical control-flow. In this paper, we propose a novel instruction shuffle scheme that features an efficient control-flow handling mechanism. The cornerstones are composed of a shuffle source instruction buffer array and an instruction shuffle unit. The shuffle unit can concurrently deliver instructions of multiple distinct control-flows from the instruction buffer array to eligible SIMD lanes. Our instruction shuffle scheme combines the best attributes of both the SIMD and MIMD execution paradigms. Experimental results show that an average performance improvement of 86\% can be achieved, at a cost of only 5.8\% area overhead.", acknowledgement = ack-nhfb, affiliation = "Wang, YH (Reprint Author), Natl Univ Def Technol, Sch Comp Sci, Changsha, Hunan, Peoples R China. Wang, Yaohua; Chen, Shuming; Zhang, Kai; Wan, Jianghua; Chen, Xiaowen; Chen, Hu; Wang, Haibo, Natl Univ Def Technol, Sch Comp Sci, Changsha, Hunan, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "nudtyh@gmail.com", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Natural Science Foundation of China [61070036, 61133007]; National 863 Program of China [2009AA011704]", funding-text = "The work is partially supported by the National Natural Science Foundation of China (No. 61070036), the National Natural Science Foundation of China (No. 61133007), the National 863 Program of China (No. 2009AA011704).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arrays; data dependent control-flow; diverse control-flow behavior; identical control-flow behavior; instruction buffer array; Instruction sets; instruction shuffle; instruction shuffle unit; Kernel; MIMD execution paradigm; MIMD-like performance; multiple instruction multiple data; parallel processing; Process control; Resource management; Scalability; shuffle source instruction buffer array; SIMD; SIMD architecture; SIMD execution paradigm; single instruction multiple data; Vectors", number-of-cited-references = "9", research-areas = "Computer Science", researcherid-numbers = "Chen, Shuming/Q-1147-2018", times-cited = "6", unique-id = "Wang:2012:ISA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Panda:2012:BFB, author = "Reena Panda and Paul V. Gratz and Daniel A. Jim{\'e}nez", title = "{B-Fetch}: Branch Prediction Directed Prefetching for In-Order Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "41--44", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.33", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Computer architecture is beset by two opposing trends. Technology scaling and deep pipelining have led to high memory access latencies; meanwhile, power and energy considerations have revived interest in traditional in-order processors. In-order processors, unlike their superscalar counterparts, do not allow execution to continue around data cache misses. In-order processors, therefore, suffer a greater performance penalty in the light of the current high memory access latencies. 
Memory prefetching is an established technique to reduce the incidence of cache misses and improve performance. In this paper, we introduce B-Fetch, a new technique for data prefetching which combines branch prediction based lookahead deep path speculation with effective address speculation, to efficiently improve performance in in-order processors. Our results show that B-Fetch improves performance 38.8\% on SPEC CPU2006 benchmarks, beating a current, state-of-the-art prefetcher design at approximately 1/3 the hardware overhead.", acknowledgement = ack-nhfb, affiliation = "Panda, R (Reprint Author), Texas A\&M Univ, Dept Elect \& Comp Engn, CESG, College Stn, TX 77843 USA. Panda, Reena; Gratz, Paul V., Texas A\&M Univ, Dept Elect \& Comp Engn, CESG, College Stn, TX 77843 USA. Jimenez, Daniel A., Univ Texas San Antonio, Dept Comp Sci, San Antonio, TX USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "reena.panda@tamu.edu pgratz@tamu.edu dj@cs.utsa.edu", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "address speculation; B-fetch; Benchmark testing; Branch Prediction; branch prediction based lookahead deep path speculation; branch prediction directed prefetching; Cache memory; computer architecture; Computer architecture; data cache; Data Cache Prefetching; deep pipelining; energy consideration; Hardware; in-order processor; In-order Processors; memory access latency; memory prefetching; Memory Systems; Pipelines; power aware computing; power consideration; Prefetching; Process control; Registers; storage management; superscalar processor; technology scaling; Value Prediction", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "4", unique-id = "Panda:2012:BFB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Miller:2012:MEP, author = "Timothy N. Miller and Renji Thomas and Radu Teodorescu", title = "Mitigating the Effects of Process Variation in Ultra-low Voltage Chip Multiprocessors using Dual Supply Voltages and Half-Speed Units", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "45--48", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.36", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Energy efficiency is a primary concern for microprocessor designers. One very effective approach to improving processor energy efficiency is to lower its supply voltage to very near to the transistor threshold voltage. This reduces power consumption dramatically, improving energy efficiency by an order of magnitude. 
Low voltage operation, however, increases the effects of parameter variation resulting in significant frequency heterogeneity between (and within) otherwise identical cores. This heterogeneity severely limits the maximum frequency of the entire CMP. We present a combination of techniques aimed at reducing the effects of variation on the performance and energy efficiency of near-threshold, many-core CMPs. Dual Voltage Rail (DVR), mitigates core-to-core variation with a dual-rail power delivery system that allows post-manufacturing assignment of different supply voltages to individual cores. This speeds up slow cores by assigning them to a higher voltage and saves power on fast cores by assigning them to a lower voltage. Half-Speed Unit (HSU) mitigates within-core variation by halving the frequency of select functional blocks with the goal of boosting the frequency of individual cores, thus raising the frequency ceiling for the entire CMP. Together, these variation-reduction techniques result in almost 50\% improvement in CMP performance for the same power consumption over a mix of workloads.", acknowledgement = ack-nhfb, affiliation = "Miller, TN (Reprint Author), Ohio State Univ, Dept Comp Sci \& Engn, Columbus, OH 43210 USA. Miller, Timothy N.; Thomas, Renji; Teodorescu, Radu, Ohio State Univ, Dept Comp Sci \& Engn, Columbus, OH 43210 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "millerti@cse.ohio-state.edu thomasr@cse.ohio-state.edu teodores@cse.ohio-state.edu", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CCF-1117799]", funding-text = "This work was supported in part by the National Science Foundation under grant CCF-1117799 and an allocation of computing time from the Ohio Supercomputer Center. The authors would like to thank the anonymous reviewers for their suggestions and feedback.", journal-iso = "IEEE Comput. 
Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; chip multiprocessors; Clocks; CMP frequency ceiling; CMP performance; Computer architecture; core-to-core variation; Delay; dual supply voltage; dual voltage rail; dual-rail power delivery system; energy conservation; Energy efficiency; energy efficiency; Energy efficiency; frequency heterogeneity; half-speed unit; low voltage operation; microprocessor chips; microprocessor design; Multiprocessing systems; near-threshold voltage; parameter variation; power aware computing; power consumption; Power demand; process variation; process variation effect; Rails; supply voltage assignment; Threshold voltage; transistor threshold voltage; ultra-low voltage chip multiprocessors; within-core variation", number-of-cited-references = "15", research-areas = "Computer Science", times-cited = "6", unique-id = "Miller:2012:MEP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Li:2012:LSS, author = "Yong Li and Rami Melhem and Alex K. Jones", title = "Leveraging Sharing in Second Level Translation-Lookaside Buffers for Chip Multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "49--52", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.35", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Traversing page table during virtual to physical address translation causes significant pipeline stalls when misses occur in the translation-lookaside buffer (TLB). To mitigate this penalty, we propose a fast, scalable, multi-level TLB organization that leverages page sharing behaviors and performs efficient TLB entry placement. 
Our proposed partial sharing TLB (PSTLB) reduces TLB misses by around 60\%. PSTLB also improves TLB performance by nearly 40\% compared to traditional private TLBs and 17\% over the state of the art scalable TLB proposal.", acknowledgement = ack-nhfb, affiliation = "Li, Y (Reprint Author), Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15261 USA. Li, Yong, Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15261 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yol26@pitt.edu", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-0702452]", funding-text = "This work is supported by NSF award CCF-0702452", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; buffer storage; chip multiprocessor; CMPs; Fluids; microprocessor chips; multilevel TLB organization; multiprocessing systems; Oceans; page sharing behavior; Partial Sharing; partial sharing TLB; Prefetching; private TLB; program interpreters; Runtime; second level translation-lookaside buffers; Tiles; TLB entry placement; TLBs; Virtual private networks; virtual-to-physical address translation", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "1", unique-id = "Li:2012:LSS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Delimitrou:2012:DDS, author = "Christina Delimitrou and Sriram Sankar and Kushagra Vaid and Christos Kozyrakis", title = "Decoupling Datacenter Storage Studies from Access to Large-Scale Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "53--56", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.37", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 
07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Suboptimal storage design has significant cost and power impact in large-scale datacenters (DCs). Performance, power and cost-optimized systems require deep understanding of target workloads, and mechanisms to effectively model different storage design choices. Traditional benchmarking is invalid in cloud data-stores, representative storage profiles are hard to obtain, while replaying applications in different storage configurations is impractical both in cost and time. Despite these issues, current workload generators are not able to reproduce key aspects of real application patterns (e.g., spatial/temporal locality, I/O intensity). In this paper, we propose a modeling and generation framework for large-scale storage applications. As part of this framework we use a state diagram-based storage model, extend it to a hierarchical representation, and implement a tool that consistently recreates DC application I/O loads. We present the principal features of the framework that allow accurate modeling and generation of storage workloads, and the validation process performed against ten original DC application traces. Finally, we explore two practical applications of this methodology: SSD caching and defragmentation benefits on enterprise storage. Since knowledge of the workload's spatial and temporal locality is necessary to model these use cases, our framework was instrumental in quantifying their performance benefits. The proposed methodology provides detailed understanding of the storage activity of large-scale applications, and enables a wide spectrum of storage studies, without the requirement to access application code and full application deployment.", acknowledgement = ack-nhfb, affiliation = "Delimitrou, C (Reprint Author), Stanford Univ, Stanford, CA 94305 USA. Delimitrou, Christina; Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305 USA. 
Sankar, Sriram; Vaid, Kushagra, Microsoft Corp, Seattle, WA USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "cdel@stanford.edu srsankar@microsoft.com kvaid@microsoft.com kozyraki@stanford.edu", da = "2019-06-20", doc-delivery-number = "057JO", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cloud data-store; Computational modeling; computer centres; cost impact; datacenter storage; Electronic mail; enterprise storage defragmentation; Generators; large-scale datacenter; Load modeling; Mass storage; Modeling of computer architecture; Modeling techniques; power impact; SSD caching; state diagram-based storage model; Storage area networks; storage design choice; storage management; storage profile; storage workload; suboptimal storage design; Super (very large) computers; Throughput; Very large scale integration; workload spatial locality; workload temporal locality", number-of-cited-references = "7", research-areas = "Computer Science", times-cited = "0", unique-id = "Delimitrou:2012:DDS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Chen:2012:NPD, author = "Jie Chen and Guru Venkataramani and Gabriel Parmer", title = "The Need for Power Debugging in the Multi-Core Environment", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "57--60", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Debugging an application for power has a wide array of benefits ranging from minimizing the thermal hotspots to reducing the likelihood of CPU malfunction. 
In this work, we justify the need for power debugging, and show that performance debugging of a parallel application does not automatically guarantee power balance across multiple cores. We perform experiments and show our results using two case study benchmarks, Volrend from Splash-2 and Bodytrack from Parsec-1.0.", acknowledgement = ack-nhfb, affiliation = "Chen, J (Reprint Author), George Washington Univ, Washington, DC 20052 USA. Chen, Jie; Venkataramani, Guru; Parmer, Gabriel, George Washington Univ, Washington, DC 20052 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CCF-1117243]", funding-text = "This material is based upon work supported in part by the National Science Foundation under Grant No. CCF-1117243.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Bodytrack; Debugging; Instruction sets; Multi-cores; multicore environment; Multicore processing; multiprocessing systems; parallel application; parallel programming; Parsec-1.0; performance debugging; power aware computing; power balance; Power Debugging; power debugging; Power Debugging; Power demand; Power Imbalance; program debugging; Splash-2; Volrend", number-of-cited-references = "18", research-areas = "Computer Science", times-cited = "2", unique-id = "Chen:2012:NPD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Meza:2012:EES, author = "Justin Meza and Jichuan Chang and HanBin Yoon and Onur Mutlu and Parthasarathy Ranganathan", title = "Enabling Efficient and Scalable Hybrid Memories Using Fine-Granularity {DRAM} Cache Management", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "61--64", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = 
"https://doi.org/10.1109/L-CA.2012.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hybrid main memories composed of DRAM as a cache to scalable non-volatile memories such as phase-change memory (PCM) can provide much larger storage capacity than traditional main memories. A key challenge for enabling high-performance and scalable hybrid memories, though, is efficiently managing the metadata (e.g., tags) for data cached in DRAM at a fine granularity. Based on the observation that storing metadata off-chip in the same row as their data exploits DRAM row buffer locality, this paper reduces the overhead of fine-granularity DRAM caches by only caching the metadata for recently accessed rows on-chip using a small buffer. Leveraging the flexibility and efficiency of such a fine-granularity DRAM cache, we also develop an adaptive policy to choose the best granularity when migrating data into DRAM. On a hybrid memory with a 512MB DRAM cache, our proposal using an 8KB on-chip buffer can achieve within 6\% of the performance of, and 18\% better energy efficiency than, a conventional 8MB SRAM metadata store, even when the energy overhead due to large SRAM metadata storage is not considered.", acknowledgement = ack-nhfb, affiliation = "Meza, J (Reprint Author), Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Meza, Justin; Yoon, HanBin; Mutlu, Onur, Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Chang, Jichuan; Ranganathan, Parthasarathy, Hewlett Packard Labs, Palo Alto, CA USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "meza@cmu.edu jichuan.chang@hp.com hanbinyoon@cmu.edu onur@cmu.edu partha.ranganathan@hp.com", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF CAREER [CCF-0953246]; NSF EAGER [CCF-1147397]; Gigascale Systems Research Center", funding-text = "We thank the members of the SAFARI research group and the anonymous reviewers for their comments and suggestions. We gratefully acknowledge the support of an NSF CAREER Award CCF-0953246, NSF EAGER Grant CCF-1147397, and the Gigascale Systems Research Center. Part of this work was done while Justin Meza and HanBin Yoon were interns at Hewlett-Packard Labs.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Buffer storage; Cache memories; Cache memory; cache storage; data migration; DRAM chips; DRAM row buffer locality; dynamic random access memory; fine-granularity DRAM cache management; hybrid main memories; hybrid main memory; Indexes; Memory management; meta data; metadata caching; metadata management; metadata storage; non-volatile memories; Phase change materials; phase-change memory; Random access memory; scalable hybrid memory; System-on-a-chip; tag storage", number-of-cited-references = "16", research-areas = "Computer Science", times-cited = "35", unique-id = "Meza:2012:EES", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zidenberg:2012:MHS, author = "Tsahee Zidenberg and Isaac Keslassy and Uri Weiser", title = "{MultiAmdahl}: How Should {I} Divide My Heterogeneous Chip?", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "65--68", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 
2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Future multiprocessor chips will integrate many different units, each tailored to a specific computation. When designing such a system, a chip architect must decide how to distribute the available limited system resources, such as area and power, among all the computational units. In this paper, we introduce MultiAmdahl, an analytical optimization technique for resource sharing among heterogeneous units. MultiAmdahl takes into account the workload, the performance of each computational unit, and the total available resource. The results obtained by MultiAmdahl allow us, for example, to provide a closed-form solution for an optimal asymmetric-offload chip, and to analyze the impact of different design constraints on an optimal chip architecture.", acknowledgement = ack-nhfb, affiliation = "Zidenberg, T (Reprint Author), Technion Israel Inst Technol, EE Dept, Haifa, Israel. Zidenberg, Tsahee; Keslassy, Isaac; Weiser, Uri, Technion Israel Inst Technol, EE Dept, Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tsahee@tx.technion.ac.il isaac@ee.technion.ac.il weiser@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "057JO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "European Research Council [210389]; Intel Heterogeneous Computing research grant", funding-text = "This work was partly supported by the European Research Council Starting Grant No. 210389 and by the Intel Heterogeneous Computing research grant.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "area resource; asymmetric-offload chip; Central Processing Unit; Chip Multiprocessors; Computational modeling; computational unit; Computer architecture; design constraint; heterogeneous chip; heterogeneous unit; Mathematical model; microprocessor chips; Modeling of computer architecture; MultiAmdahl analytical optimization technique; multiprocessing systems; multiprocessor chip; optimal chip architecture; Optimization; power resource; Program processors; resource allocation; Resource management; resource sharing", keywords-plus = "AMDAHLS LAW", number-of-cited-references = "7", research-areas = "Computer Science", times-cited = "12", unique-id = "Zidenberg:2012:MHS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2012:BC, author = "Anonymous", title = "[{Back} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "c4--c4", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.38", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2012:BIC, author = "Anonymous", title = "[{Back} inside cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "c3--c3", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.37", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2012:FIC, author = "Anonymous", title = "[{Front} inside cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "c2--c2", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.36", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Skadron:2013:INE, author = "Kevin Skadron", title = "Introducing the New {Editor-in-Chief} of the {{\booktitle{IEEE Computer Architecture Letters}}}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "1--1", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The out-going Editor-in-Chief introduces Jose F. Mart{\'\i}nez as the new Editor-in-Chief (EIC) of the IEEE Computer Architecture Letters (CAL). A brief professional biography is included. In addition, it is noted that CAL aims to provide fast-turnaround for early work with outstanding promise. The majority of decisions are returned within one month, nearly all within six weeks, and all decisions are rendered within two months. The overall acceptance rate has consistently run at about 25\%. Many papers first published in CAL go on to become full papers in premier conferences and journals, and CAL's impact factor continues to increase. CAL has been a valuable addition to the publishing landscape in computer architecture and under Prof. Martinez's leadership, we can look forward to even greater impact in the future. I would like to take this opportunity to thank all of the CAL Associate Editors, authors, readers, and reviewers for their great help and support.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "172HT", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", number-of-cited-references = "0", research-areas = "Computer Science", times-cited = "0", unique-id = "Skadron:2013:INE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2013:AI, author = "Anonymous", title = "2012 Annual Index", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This index covers all technical items - papers, correspondence, reviews, etc. - that appeared in this periodical during the year, and items from previous years that were commented upon or corrected in this year. Departments and other items may also be covered if they have been judged to have archival value. The Author Index contains the primary entry for each item, listed under the first author's name. The primary entry includes the co-authors' names, the title of the paper or other item, and its location, specified by the publication abbreviation, year, month, and inclusive pagination. The Subject Index contains entries describing the item under all appropriate subject headings, plus the first author's name, the publication abbreviation, month, and year, and inclusive pages. Note that the item title is found only under the primary entry in the Author Index.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Indexes", } @Article{Eeckhout:2013:MNE, author = "Lieven Eeckhout", title = "A Message from the New {Editor-in-Chief} and Introduction of New {Associate Editors}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "2--2", month = jan # "\slash " # jun, year = "2013", CODEN = "????", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, affiliation = "Eeckhout, L (Reprint Author), Univ Ghent, B-9000 Ghent, Belgium.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "172HT", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", number-of-cited-references = "0", research-areas = "Computer Science", times-cited = "0", unique-id = "Eeckhout:2013:MNE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Martinez:2013:MNE, author = "J. 
Martinez", title = "A Message from the New {Editor-in-Chief} and Introduction of New {Associate} Editors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "2--4", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The incoming Editor-in-Chief states that his goal during his tenure with IEEE Computer Architecture Letters (CAL) will be to further increase its visibility in our research community, and to attract more submissions from computer architecture leaders. The ``Best of {CAL}'' session at HPCA, which has taken place for the last couple of years, is a good step in this direction. He is also committed to continue improving the coordination with authors and conference program chairs, and to consolidate CAL's unique place in the publication pipeline as the prime venue for quick dissemination of high-quality novel ideas and early results.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Tavakkol:2013:NSS, author = "Arash Tavakkol and Mohammad Arjomand and Hamid Sarbazi-Azad", title = "{Network-on-SSD}: a Scalable and High-Performance Communication Design Paradigm for {SSDs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In recent years, flash memory solid state disks (SSDs) have shown a great potential to change storage infrastructure because of their advantages of high speed and high throughput random access. This promising storage, however, greatly suffers from performance loss because of frequent ``erase-before-write'' and ``garbage collection'' operations. Thus, novel circuit-level, architectural, and algorithmic techniques are currently explored to address these limitations. In parallel with others, the current study investigates replacing shared buses in multi-channel architecture of SSDs with an interconnection network to achieve scalable, high throughput, and reliable SSD storage systems. Roughly speaking, such a communication scheme provides superior parallelism that allows us to compensate the main part of the performance loss related to the aforementioned limitations through increasing data storage and retrieval processing throughput.", acknowledgement = ack-nhfb, affiliation = "Tavakkol, A (Reprint Author), Sharif Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol, Arash; Arjomand, Mohammad; Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran. 
Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch Comp Sci, Tehran, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tavakkol@ce.sharif.edu arjomand@ce.sharif.edu azad@sharif.edu", da = "2019-06-20", doc-delivery-number = "172HT", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "algorithmic technique; architectural technique; Bandwidth; Buffer storage; circuit-level technique; Complexity theory; Data storage systems; data storage throughput; flash memories; Flash memory; flash memory solid state disks; frequent erase-before-write operations; garbage collection operations; high speed random access; high throughput random access; high-performance communication design paradigm; integrated circuit design; integrated circuit reliability; Inter-package parallelism; interconnection network; Interconnection network; interconnection network; Interconnections (Subsystems); Mass storage; memory architecture; multichannel architecture; multiprocessor interconnection networks; network-on-chip; network-on-SSD; parallel memories; Parallel processing; parallel storage; performance evaluation; performance loss; retrieval processing throughput; scalable communication design paradigm; Solid state disk; SSD storage system reliability; storage infrastructure; storage management; system buses; Throughput", keywords-plus = "MEMORY", number-of-cited-references = "6", research-areas = "Computer Science", times-cited = "1", unique-id = "Tavakkol:2013:NSS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Sun:2013:NWC, author = "Guang Sun and Chia-Wei Chang and Bill Lin", title = "A New Worst-Case Throughput Bound for Oblivious Routing in Odd Radix Mesh Network", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = 
"2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "1/2 network capacity is often believed to be the limit of worst-case throughput for mesh networks. However, this letter provides a new worst-case throughput bound, which is higher than 1/2 network capacity, for odd radix two-dimensional mesh networks. In addition, we propose a routing algorithm called U2TURN that can achieve this worst-case throughput bound. U2TURN considers all routing paths with at most 2 turns and distributes the traffic loads uniformly in both X and Y dimensions. Theoretical analysis and simulation results show that U2TURN outperforms existing routing algorithms in worst-case throughput. Moreover, U2TURN achieves good average-throughput at the expense of approximately 1.5x minimal average hop count.", acknowledgement = ack-nhfb, affiliation = "Sun, G (Reprint Author), Tsinghua Univ, Beijing, Peoples R China. Sun, Guang, Tsinghua Univ, Beijing, Peoples R China. Chang, Chia-Wei; Lin, Bill, Univ Calif San Diego, San Diego, CA 92103 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "172HT", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithm design and analysis; average-case throughput; Computer architecture; Interconnection architectures; mesh; Mesh networks; network capacity; network-on-chip; Networks-on-Chip (NoC); oblivious routing; odd radix mesh network; odd radix two-dimensional mesh network; On-chip interconnection networks; Parallel algorithms; Routing; routing; Routing; Routing protocols; Throughput; traffic load; U2TURN; Worst-case analysis; worst-case throughput; worst-case throughput bound", number-of-cited-references = "10", ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X", research-areas = "Computer Science", researcherid-numbers = "Lin, Binshan/A-9772-2009", times-cited = "1", unique-id = "Sun:2013:NWC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Karsli:2013:EDT, author = "I. Burak Karsli and Pedro Reviriego and M. Fatih Balli and O{\u{g}}uz Ergin and J. A. Maestro", title = "Enhanced Duplication: a Technique to Correct Soft Errors in Narrow Values", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Soft errors are transient errors that can alter the logic value of a register bit causing data corruption. They can be caused by radiation particles such as neutrons or alpha particles. Narrow values are commonly found in the data consumed or produced by processors. Several techniques have recently been proposed to exploit the unused bits in narrow values to protect them against soft errors. 
These techniques replicate the narrow value over the unused register bits such that errors can be detected when the value is duplicated and corrected when the value is tripled. In this letter, a technique that can correct errors when the narrow value is only duplicated is presented. The proposed approach stores a modified duplicate of the narrow value such that errors on the original value and the duplicate can be distinguished and therefore corrected. The scheme has been implemented at the circuit level to evaluate its speed and also at the architectural level to assess the benefits in correcting soft errors. The results show that the scheme is significantly faster than a parity check and can improve substantially the number of soft errors that are corrected compared to existing techniques.", acknowledgement = ack-nhfb, affiliation = "Karsli, IB (Reprint Author), TOBB Univ Econ \& Technol, Ankara, Turkey. Karsli, I. Burak; Balli, M. Fatih; Ergin, O{\u{g}}uz, TOBB Univ Econ \& Technol, Ankara, Turkey. Reviriego, Pedro; Maestro, J. A., Univ Antonio de Nebrija, Madrid, Spain.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "172HT", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish Ministry of Science and Education [AYA2009-13300-C03]; Scientific and Technological Research Council of Turkey (TUBITAK) [112E004]", funding-text = "This work was supported in part by the Spanish Ministry of Science and Education under Grant AYA2009-13300-C03 and by the Scientific and Technological Research Council of Turkey (TUBITAK) under Grant 112E004. The work is a collaboration in the framework of COST ICT Action 1103 ``Manufacturable and Dependable Multicore Architectures at Nanoscale''.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "alpha particles; architectural level; Benchmark testing; computer architecture; Data Cache; data corruption; Data processing; enhanced duplication; Error correction; Error Correction; Error correction; Error-checking; Logic gates; logic value; microprocessor chips; narrow values; Narrow Values; narrow values; neutrons; Parity check codes; processors; Program processors; radiation hardening (electronics); radiation particles; Redundant design; register bit; Registers; soft errors; Soft Errors; soft errors", number-of-cited-references = "11", ORCID-numbers = "Sousa, Leonel/0000-0002-8066-221X Ergin, O{\u{g}}uz/0000-0003-2701-3787 Maestro, Juan Antonio/0000-0001-7133-9026 Reviriego, Pedro/0000-0001-6805-6519", research-areas = "Computer Science", researcherid-numbers = "Sousa, Leonel/B-2749-2009 Ergin, O{\u{g}}uz/E-5717-2010 Maestro, Juan Antonio/L-6091-2014 Reviriego, Pedro/B-8353-2009", times-cited = "2", unique-id = "Karsli:2013:EDT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lyons:2013:SFF, author = "Michael Lyons and Gu-Yeon Wei and David Brooks", title = "{Shrink-Fit}: a Framework for Flexible Accelerator Sizing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "RTL design complexity discouraged adoption of reconfigurable logic in general purpose systems, impeding opportunities for performance and energy improvements. Recent improvements to HLS compilers simplify RTL design and are easing this barrier. 
A new challenge will emerge: managing reconfigurable resources between multiple applications with custom hardware designs. In this paper, we propose a method to ``shrink-fit'' accelerators within widely varying fabric budgets. Shrink-fit automatically shrinks existing accelerator designs within small fabric budgets and grows designs to increase performance when larger budgets are available. Our method takes advantage of current accelerator design techniques and introduces a novel architectural approach based on fine-grained virtualization. We evaluate shrink-fit using a synthesized implementation of an IDCT for decoding JPEGs and show the IDCT accelerator can shrink by a factor of 16x with minimal performance and area overheads. Using shrink-fit, application designers can achieve the benefits of hardware acceleration with single RTL designs on FPGAs large and small.", acknowledgement = ack-nhfb, affiliation = "Lyons, M (Reprint Author), Harvard Univ, Sch Engn \& Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael; Wei, Gu-Yeon; Brooks, David, Harvard Univ, Sch Engn \& Appl Sci, Cambridge, MA 02138 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "172HT", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accelerators; computational complexity; Computer applications; custom hardware design; Decoding; discrete cosine transforms; fabric budget; field programmable gate arrays; Field programmable gate arrays; fine grained virtualization; flexible accelerator sizing; FPGA; general purpose computers; general purpose system; hardware acceleration; Heterogeneous (hybrid) systems; HLS compiler; IDCT accelerator; inverse transforms; JPEG decoding; program compilers; Program processors; reconfigurable architectural approach; reconfigurable architectures; Reconfigurable hardware; reconfigurable logic; reconfigurable resource management; RTL design complexity; Runtime; shrink fit accelerator; Special-Purpose and Application-Based Systems; temporal logic; virtual machines; virtualisation", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "0", unique-id = "Lyons:2013:SFF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Duong:2013:CAS, author = "Nam Duong and Alexander V. Veidenbaum", title = "Compiler-Assisted, Selective Out-Of-Order Commit", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper proposes an out-of-order instruction commit mechanism using a novel compiler/architecture interface. The compiler creates instruction ``blocks'' guaranteeing some commit conditions and the processor uses the block information to commit certain instructions out of order. 
Micro-architectural support for the new commit mode is made on top of the standard, ROB-based processor and includes out-of-order instruction commit with register and load queue entry release. The commit mode may be switched multiple times during execution. Initial results for a 4-wide processor show that, on average, 52\% instructions are committed out of order resulting in 10\% to 26\% speedups over in-order commit, with minimal hardware overhead. The performance improvement is a result of an effectively larger instruction window that allows more cache misses to be overlapped for both L1 and L2 caches.", acknowledgement = ack-nhfb, affiliation = "Duong, N (Reprint Author), Univ Calif Irvine, Dept Comp Sci, Irvine, CA 92717 USA. Duong, Nam; Veidenbaum, Alexander V., Univ Calif Irvine, Dept Comp Sci, Irvine, CA 92717 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "nlduong@ics.uci.edu alexv@ics.uci.edu", da = "2019-06-20", doc-delivery-number = "172HT", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architecture/compiler co-design; Benchmark testing; block information; cache misses; cache storage; Cache storage; cache storage; Cache storage; commit conditions; compiler-architecture interface; compiler-assisted selective out-of-order commit; computer architecture; Computer architecture; computer architecture; dynamically-scheduled and statically-scheduled implementation; Hardware/software interfaces; instruction blocks; instruction sets; L1 cache; L2 cache; load queue entry release; microarchitectural support; minimal hardware overhead; Out of order instruction; Out-of-order commit; out-of-order instruction commit mechanism; overlapping cache misses; performance evaluation; performance improvement; Pipeline implementation; Pipeline processors; program compilers; Program processors; register; resource release; RISC/CISC; ROB-based processor; Superscalar; VLIW architectures; Von Neumann architectures", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "2", unique-id = "Duong:2013:CAS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Nilakantan:2013:MES, author = "Siddharth Nilakantan and Steven Battle and Mark Hempstead", title = "Metrics for Early-Stage Modeling of Many-Accelerator Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The term ``Dark Silicon'' has been coined to describe the threat to microprocessor performance caused by increasing transistor power density.
Improving energy efficiency is now the primary design goal for all market segments of microprocessors from mobile to server. Specialized hardware accelerators, designed to run only a subset of workloads with orders of magnitude energy efficiency improvement, are seen as a potential solution. Selecting an ensemble of accelerators to best cover the workloads run on a platform remains a challenge. We propose metrics for accelerator selection derived from a detailed communication-aware performance model and present an automated methodology to populate this model. Employing a combination of characterized RTL and our selection metrics, we evaluate a set of accelerators for a sample application and compare performance to selections based on execution time and Pollack's rule. We find that the architecture selected by our communication-aware metric shows improved performance over architectures selected based on execution time and Pollack's rule, as they do not account for speedup being limited by communication.", acknowledgement = ack-nhfb, affiliation = "Nilakantan, S (Reprint Author), Drexel Univ, Dept Elect \& Comp Engn, Philadelphia, PA 19104 USA. Nilakantan, Siddharth; Battle, Steven; Hempstead, Mark, Drexel Univ, Dept Elect \& Comp Engn, Philadelphia, PA 19104 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sn446@drexel.edu sjb328@drexel.edu mdh77@drexel.edu", da = "2019-06-20", doc-delivery-number = "172HT", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accelerators; Code Profiling; communication-aware performance model; Computer architecture; computer architecture; Computer Systems Organization; dark silicon; General; hardware accelerators; Heterogeneous (hybrid) systems; Heterogeneous Architectures; magnitude energy efficiency improvement; many-accelerator architectures; microprocessor; microprocessor chips; Modeling; Modeling of computer architecture; modelling; Multiprocessing systems; Other Architecture Styles; performance evaluation; Pollack rule; Processor Architectures; Program processors; RTL; transistor power density; transistors", number-of-cited-references = "16", ORCID-numbers = "Nilakantan, Siddharth/0000-0003-1067-700X", research-areas = "Computer Science", times-cited = "3", unique-id = "Nilakantan:2013:MES", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Delimitrou:2013:NCD, author = "Christina Delimitrou and Christos Kozyrakis", title = "The {Netflix} Challenge: Datacenter Edition", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The hundreds of thousands of servers in modern warehouse scale systems make performance and efficiency optimizations pressing design challenges. These systems are traditionally considered homogeneous. However, that is not typically the case. 
Multiple server generations compose a heterogeneous environment, whose performance opportunities have not been fully explored since techniques that account for platform heterogeneity typically do not scale to the tens of thousands of applications hosted in large-scale cloud providers. We present ADSM, a scalable and efficient recommendation system for application-to-server mapping in large-scale datacenters (DCs) that is QoS-aware. ADSM overcomes the drawbacks of previous techniques, by leveraging robust and computationally efficient analytical methods to scale to tens of thousands of applications with minimal overheads. It is also QoS-aware, mapping applications to platforms while enforcing strict QoS guarantees. ADSM is derived from validated analytical models, has low and bounded prediction errors, is simple to implement and scales to thousands of applications without significant changes to the system. Over 390 real DC workloads, ADSM improves performance by 16\% on average and up to 2.5x and efficiency by 22\% in a DC with 10 different server configurations.", acknowledgement = ack-nhfb, affiliation = "Delimitrou, C (Reprint Author), Stanford Univ, Stanford, CA 94305 USA. Delimitrou, Christina; Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "cdel@stanford.edu kozyraki@stanford.edu", da = "2019-06-20", doc-delivery-number = "172HT", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ADSM; application mapping; Application studies resulting in better multiple-processor systems; application-to-server mapping; Computer architecture; computer centres; Computer System Implementation; Computer Systems Organization; Data centers; datacenter; design challenge; Design studies; evaluation; Heterogeneous (hybrid) systems; Large and Medium ( Mainframe ) Computers; Large-scale systems; Measurement; modeling; Multiprocessing systems; Netflix challenge; Other Architecture Styles; Parallel Architectures; Performance of Systems; Processor Architectures; QoS-aware; quality of service; Scheduling; Scheduling and task partitioning; server generation; simulation of multiple-processor systems; Special-Purpose and Application-Based Systems; Super (very large) computers; warehouse-scale system", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "6", unique-id = "Delimitrou:2013:NCD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2013:RL, author = "Anonymous", title = "2012 reviewers list", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "33--34", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.11", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The publication offers a note of thanks and lists its reviewers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "IEEE publishing", } @Article{Anonymous:2013:IOAa, author = "Anonymous", title = "{IEEE} Open Access Publishing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "35--35", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.13", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Advertisement: This publication offers open access options for authors. IEEE open access publishing.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2013:ITN, author = "Anonymous", title = "{{\booktitle{IEEE Transactions}}} Newsletter", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "1", pages = "36--36", month = jan # "\slash " # jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.14", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Advertisement: Stay connected with the IEEE Computer Society Transactions by signing up for our new Transactions Connection newsletter. It is free and contains valuable information.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Martinez:2013:E, author = "J. F. 
Martinez", title = "Editorial", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "37--38", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.32", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jian:2013:HPE, author = "Xun Jian and John Sartori and Henry Duwe and Rakesh Kumar", title = "High Performance, Energy Efficient Chipkill Correct Memory with Multidimensional Parity", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "39--42", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.21", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "It is well-known that a significant fraction of server power is consumed in memory; this is especially the case for servers with chipkill correct memories. We propose a new chipkill correct memory organization that decouples correction of errors due to local faults that affect a single symbol in a word from correction of errors due to device-level faults that affect an entire column, sub-bank, or device. By using a combination of two codes that separately target these two fault modes, the proposed chipkill correct organization reduces code overhead by half as compared to conventional chipkill correct memories for the same rank size. Alternatively, this allows the rank size to be reduced by half while maintaining roughly the same total code overhead. 
Simulations using PARSEC and SPEC benchmarks show that, compared to a conventional double chipkill correct baseline, the proposed memory organization, by providing double chipkill correct at half the rank size, reduces power by up to 41\%, 32\% on average over a conventional baseline with the same chipkill correct strength and access granularity that relies on linear block codes alone, at only 1\% additional code overhead.", acknowledgement = ack-nhfb, affiliation = "Jian, X (Reprint Author), Univ Illinois, Urbana, IL USA. Jian, Xun; Sartori, John; Duwe, Henry; Kumar, Rakesh, Univ Illinois, Urbana, IL USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "279CD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "block codes; chipkill correct; chipkill correct memory organization; code overhead reduction; Computer architecture; device level fault; DRAM; DRAM chips; error correction; error correction codes; fault mode; fault tolerant computing; granular computing; granularity access; linear block code; linear codes; low power; Low power electronics; PARSEC; Random access memory; rank size; reliable memory; server power consumption; Servers; SPEC; storage management", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "6", unique-id = "Jian:2013:HPE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Maddah:2013:DDS, author = "Rakan Maddah and Sangyeun Cho and Rami Melhem", title = "Data Dependent Sparing to Manage Better-Than-Bad Blocks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "43--46", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.20", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = 
"Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We forecast that proper handling of unreliable storage blocks (e.g., ``bad block management'' in solid-state drives) will remain critical for future systems built with advanced and emerging memory technologies. This paper argues that the conventional block retirement and sparing approach --- a block is retired as soon as it shows faulty behavior --- is overly conservative and inefficient. We observe that it is highly unlikely that all faulty bits in a storage block manifest errors. Consequently, we propose data dependent sparing, a relaxed block retirement and sparing approach that recycles faulty storage blocks. At small management cost and with less than 1\% sparing, data dependent sparing achieves the same lifetime as the conventional approach with 20\% sparing.", acknowledgement = ack-nhfb, affiliation = "Maddah, R (Reprint Author), Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA. Maddah, Rakan; Cho, Sangyeun; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "rmaddah@cs.pitt.edu cho@cs.pitt.edu melhem@cs.pitt.edu", da = "2019-06-20", doc-delivery-number = "279CD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-1064976, CCF-1059283, CNS-1012070]", funding-text = "This work is supported in part by NSF grants CCF-1064976, CCF-1059283, and CNS-1012070.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "better-than-bad block management; data dependent sparing; data dependent sparing approach; Data storage systems; fault tolerant computing; faulty bits; faulty storage blocks; flash memory; Flash memory; flash memory; management cost; memory technologies; phase change memories; phase-change memory; phase-change memory (PCM); relaxed block retirement approach; solid-state drive; solid-state drive (SSD); Solid-state drives; solid-state drives; Sparing; sparing; storage block; storage management; stuck-at faults; unreliable storage block handling", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "2", unique-id = "Maddah:2013:DDS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2013:CFC, author = "Hanjoon Kim and Yonggon Kim and John Kim", title = "Clumsy Flow Control for High-Throughput Bufferless On-Chip Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "47--50", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.22", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Bufferless on-chip networks are an alternative type of on-chip network organization that can improve the cost-efficiency of an on-chip network by removing router input buffers. However, bufferless on-chip network performance degrades at high load because of the increased network contention and large number of deflected packets. The energy benefit of bufferless network is also reduced because of the increased deflection. In this work, we propose a novel flow control for bufferless on-chip networks in high-throughput manycore accelerator architectures to reduce the impact of deflection routing. 
By using a clumsy flow control (CFC), instead of the per-hop flow control that is commonly used in buffered on-chip networks, we are able to reduce the amount of deflection by up to 92\% on high-throughput workloads. As a result, on average, CFC can approximately match the performance of a baseline buffered router while reducing the energy consumption by approximately 39\%.", acknowledgement = ack-nhfb, affiliation = "Kim, H (Reprint Author), Korea Adv Inst Sci \& Technol, Dept Comp Sci, Taejon, South Korea. Kim, Hanjoon; Kim, Yonggon; Kim, John, Korea Adv Inst Sci \& Technol, Dept Comp Sci, Taejon, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "hanj@kaist.ac.kr ilios@kaist.ac.kr jjk12@kaist.ac.kr", da = "2019-06-20", doc-delivery-number = "279CD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "MKE, Korea, under the ITRC [NIPA-2012-H0301-12-1011]; BST program through the NRF of Korea; MEST [2012-0003579]", funding-text = "This research was supported in part by the MKE, Korea, under the ITRC support program supervised by the NIPA (NIPA-2012-H0301-12-1011) and in part by BST program through the NRF of Korea funded by the MEST(2012-0003579).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "bufferless NoC; bufferless router; CFC; clumsy flow control; computer architecture; Computer architecture; Computer Systems Organization; cost-efficiency improvement; Data processing; deflection routing; deflection routing impact reduction; energy benefit; energy consumption reduction; flow control; high-throughput bufferless on-chip networks; high-throughput manycore accelerator architectures; high-throughput workloads; Interconnection architectures; microprocessor chips; Multiple Data Stream Architectures (Multiprocessors); Multiprocessing systems; network contention; network routing; network-on-chip; On-chip interconnection networks; on-chip network organization; on-chip networks; Parallel architectures; Parallel Architectures; performance evaluation; Processor Architectures; router input buffer removal; System-on-chip", number-of-cited-references = "14", research-areas = "Computer Science", researcherid-numbers = "Kim, John/C-1792-2011", times-cited = "7", unique-id = "Kim:2013:CFC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kai:2013:GRP, author = "Yi Kai and Yi Wang and Bin Liu", title = "{GreenRouter}: Reducing Power by Innovating Router's Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "51--54", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.23", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "High speed routers in Internet are becoming more powerful, as well as more energy hungry. 
GreenRouter can well adapt to the traffic fluctuation, and real trace evaluations over one week show that up to 63.7\% power saving can be achieved while QoS constraints are guaranteed.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; DB; Energy efficiency; energy-aware system; green computing; Green design; GreenRouter; High-speed networks; Internet; line-card; low power design; MB; network interface card; packet processing card; power reduction; power saving efficiency; QoS constraints; router; router architecture innovation; Routers; telecommunication network routing; Telecommunication traffic; telecommunication traffic; traffic flow egress direction; traffic flow ingress direction; traffic fluctuation; two-stage switch fabric", number-of-cited-references = "6", ORCID-numbers = "Wang, Yi/0000-0002-9095-6879", research-areas = "Computer Science", researcherid-numbers = "Wang, Yi/A-8884-2015", times-cited = "1", unique-id = "Kai:2013:GRP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Joo:2013:HPS, author = "Yongsoo Joo and Sangsoo Park", title = "A Hybrid {PRAM} and {STT--RAM} Cache Architecture for Extending the Lifetime of {PRAM} Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "55--58", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.24", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "To extend the lifetime of phase change RAM (PRAM) caches, we propose a hybrid cache architecture that integrates a relatively small capacity of spin transfer torque RAM (STT--RAM) write buffer with a PRAM cache. Our hybrid cache improves the endurance limitation of the PRAM cache by judiciously redirecting the write traffic from an upper memory layer to the STT--RAM write buffer. 
We have demonstrated through simulation that the proposed hybrid cache outperforms existing write-traffic reduction schemes with the same area overhead. Moreover, our approach is orthogonal to the existing schemes, providing an effective way of investing die area for cache lifetime extension by being used in combination with them.", acknowledgement = ack-nhfb, affiliation = "Joo, Y (Reprint Author), Ewha Womans Univ, Dept Comp Sci \& Engn, Seoul 120750, South Korea. Joo, Yongsoo; Park, Sangsoo, Ewha Womans Univ, Dept Comp Sci \& Engn, Seoul 120750, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ysjoo@ewha.ac.kr sangsoo.park@ewha.ac.kr", da = "2019-06-20", doc-delivery-number = "279CD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Ewha Womans University", funding-text = "We thank Guangyu Sun and Cong Xu for their helpful comments on NVRAM characteristics. This research was supported by RP-Grant 2010 of Ewha Womans University. Sangsoo Park is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
author = "Emily Blem and Hadi Esmaeilzadeh and Renee {St Amant} and Karthikeyan Sankaralingam and Doug Burger",
This extended model incorporates first order effects---exposing more bottlenecks than previous applications of Amdahl's Law---while remaining simple and flexible enough to be adapted for many applications.
(print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Several different metrics have been proposed for quantifying the throughput of multicore processors. There is no clear consensus about which metric should be used. Some studies even use several throughput metrics. We show that there exists a relation between single-thread average performance metrics and throughput metrics, and that throughput metrics inherit the meaning or lack of meaning of the corresponding single-thread metric. We show that two popular throughput metrics, the weighted speedup and the harmonic mean of speedups, are inconsistent: they do not give equal importance to all benchmarks. Moreover we demonstrate that the weighted speedup favors unfairness. We show that the harmonic mean of IPCs, a seldom used throughput metric, is actually consistent and has a physical meaning. We explain under which conditions the arithmetic mean or the harmonic mean of IPCs can be used as a strong indicator of throughput increase.", acknowledgement = ack-nhfb, affiliation = "Michaud, P (Reprint Author), INRIA Rennes, Rennes, France. INRIA Rennes, Rennes, France.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "Pierre.Michaud@inria.fr", da = "2019-06-20", doc-delivery-number = "279CD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Computer Systems Organization; evaluation; Measurement; Modeling; modeling; Multi-core/single-chip multiprocessors; Multicore processing; multicore processors; multicore throughput; multicore throughput metrics; multiprocessing systems; Parallel Architectures; Parallel architectures; Performance evaluation; performance metric; Performance of Systems; Processor Architectures; Program processors; simulation of multiple-processor systems; single thread metric; software metrics", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "7", unique-id = "Michaud:2013:DMT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tembey:2013:SSS, author = "Priyanka Tembey and Augusto Vega and Alper Buyuktosunoglu and Dilma {Da Silva} and Pradip Bose", title = "{SMT} Switch: Software Mechanisms for Power Shifting", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "67--70", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.26", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Simultaneous multithreading (SMT) as a processor design to achieve higher levels of system and application throughput is a well-accepted and deployed technique in most desktop and server processors. We study the power implications of varying SMT levels i.e., thread counts per core for various multi-threaded applications on a real SMT multicore platform, and introduce a novel software mechanism of changing SMT level of a core to tune platform power. 
Power-shifting policies by varying per core SMT levels for performance benefits within a power cap are introduced. Projected power savings (of 15\%) for a streaming parallel benchmark can be attained using SMT-level power shifting mechanisms.", acknowledgement = ack-nhfb, affiliation = "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA 30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA 30332 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "279CD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application throughput; Computer architecture; Computer Systems Organization; Hardware; multi-threading; Multicore platforms; multiprocessing systems; Multithreaded processors; Multithreading; Operating Systems; Other Architecture Styles; Parallel processing; power aware computing; Power Management; Power shifting; Power system management; Process Management; Processor Architectures; processor design; Program processors; Scheduling; simultaneous multithreading; SMT; SMT multicore platform; SMT switch; SMT-level power shifting mechanism; Software engineering; software mechanisms; Software/Software Engineering; streaming parallel benchmark; tune platform power", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Tembey:2013:SSS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2013:IOAb, author = "Anonymous", title = "{IEEE} Open Access Publishing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "71--71", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.33", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2013:SCI, author = "Anonymous", title = "Stay Connected to the {IEEE Computer Society}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "72--72", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.34", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2013:BC, author = "Anonymous", title = "[{Back} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "c4--c4", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.31", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2013:BIC, author = "Anonymous", title = "[{Back} inside cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "c3--c3", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.30", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2013:FC, author = "Anonymous", title = "[{Front} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "c1--c1", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.28", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2013:FIC, author = "Anonymous", title = "[{Front} inside cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "c2--c2", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.29", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Arelakis:2014:CVA, author = "Angelos Arelakis and Per Stenstr{\"o}m", title = "A Case for a Value-Aware Cache", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.31", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Replication of values causes poor utilization of on-chip cache memory resources. This paper addresses the question: How much cache resources can be theoretically and practically saved if value replication is eliminated? We introduce the concept of value-aware caches and show that a sixteen times smaller value-aware cache can yield the same miss rate as a conventional cache. We then make a case for a value-aware cache design using Huffman-based compression. Since the value set is rather stable across the execution of an application, one can afford to reconstruct the coding tree in software. The decompression latency is kept short by our proposed novel pipelined Huffman decoder that uses canonical codewords. While the (loose) upper-bound compression factor is 5.2X, we show that, by eliminating cache-block alignment restrictions, it is possible to achieve a compression factor of 3.4X for practical designs.", acknowledgement = ack-nhfb, affiliation = "Arelakis, A (Reprint Author), Chalmers, Gothenburg, Sweden. Arelakis, Angelos; Stenstrom, Per, Chalmers, Gothenburg, Sweden.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "angelos@chalmers.se per.stenstrom@chalmers.se", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Swedish Research Council", funding-text = "This research is supported by the Swedish Research Council. The simulations ran on the resources provided by the Swedish National Infrastructure for Computing (SNIC) at C3SE.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design Styles; B.3.2.b Cache memories; cache storage; cache-block alignment restriction elimination; Clocks; coding tree reconstruction; data compression; data handling; Decoding; decompression latency; E Data; E.4 Coding and Information Theory; E.4.a Data compaction and compression; Engines; Huffman codes; Huffman coding; Huffman-based compression; Indexes; on-chip cache memory resources; System-on-a-chip; tree codes; value replication; value-aware cache design", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "3", unique-id = "Arelakis:2014:CVA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Chen:2014:PEC, author = "Zheng Chen and Huaxi Gu and Yintang Yang and Luying Bai and Hui Li", title = "A Power Efficient and Compact Optical Interconnect for Network-on-Chip", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Optical interconnect is a promising alternative to substitute the electrical interconnect for intra-chip communications. 
However, the size of ONoC is limited by the power consumption and crosstalk noise, which mainly result from the waveguide crossings in the topology.
Topology; topology; waveguide crossings; Wavelength division multiplexing; wavelength division multiplexing", 
In this paper we present a novel technique aimed at mitigating this opportunity cost by allowing GP-CPU cores to reuse accelerator memory as a non-uniform cache architecture (NUCA) substrate. On a system with a last level-2 cache of 128kB, our technique achieves on average a 25\% performance improvement when reusing four 512 kB accelerator memory blocks to form a level-3 cache. Making these blocks reusable as NUCA slices incurs on average in a 1.89\% area overhead with respect to equally-sized ad hoc cache slices.", acknowledgement = ack-nhfb, affiliation = "Cota, EG (Reprint Author), Columbia Univ, New York, NY 10027 USA. Cota, Emilio G.; Mantovani, Paolo; Carloni, Luca P., Columbia Univ, New York, NY 10027 USA. Petracca, Michele, Cadence Design Syst Inc, San Jose, CA USA. Casu, Mario R., Politecn Torino, Turin, Italy.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [1018236, 1219001]; ONR Young Investigator Award; Gigascale Systems Research Center; Focus Center Research Program (FCRP), a Semiconductor Research Corporation entity", funding-text = "This research is partially supported by the National Science Foundation under Awards \#: 1018236 and 1219001, an ONR Young Investigator Award, and the Gigascale Systems Research Center, one of six research centers funded under the Focus Center Research Program (FCRP), a Semiconductor Research Corporation entity. The authors thank John Demme and the anonymous reviewers for their insightful comments.", journal-iso = "IEEE Comput. Archit. 
To address the load imbalance issue, instead of attempting to dynamically balance the workloads, this paper proposes an energy and performance efficient Dynamic Voltage and Frequency Scaling (DVFS) scheduling scheme, which takes into account the load imbalance behavior exhibited by these applications.
compared to the baseline running the whole computation with the default frequency configuration (400 MHz).", 
Any opinions, findings, and conclusions expressed in this material are those of the authors and do not necessarily reflect the views of these sponsors.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "base-case-reached point; D Software/Software Engineering; D.4 Operating Systems; D.4 Operating Systems < D.4.7 Organization and Design; D.4.7.b Distributed systems; D.4.7.f Parallel systems; D.4.8 Performance < D.4.8.a Measurements < Distributed processing; divide and conquer methods; Divide-and-conquer; DVFS; dynamic voltage and frequency scaling; energy conservation; energy consumption reduction; energy efficient DVFS scheme; finding primes; frequency 400 MHz; Intel SCC; Intel single-chip cloud computer; irregular parallel divide-and-conquer algorithms; Load Imbalance; load imbalance behavior; many-core machine; microprocessor chips; multiprocessing systems; n-queens puzzle; Operating systems; parallel algorithms; Parallel processing; performance efficient DVFS scheme; Performance evaluation; power aware computing; processor scheduling; quicksort; recursion stops; resource allocation; Software engineering; tree traversal", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "2", unique-id = "Chou:2014:EPE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Rotem:2014:BUI, author = "Nadav Rotem and Yosi {Ben Asher}", title = "Block Unification {IF}-conversion for High Performance Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.28", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Graphics Processing Units 
accelerate data-parallel graphic calculations using wide SIMD vector units. Compiling programs to use the GPU's SIMD architectures require converting multiple control flow paths into a single stream of instructions. IF-conversion is a compiler transformation, which converts control dependencies into data dependencies, and it is used by vectorizing compilers to eliminate control flow and enable efficient code generation. In this work we enhance the IF-conversion transformation by using a block unification method to improve the currently used block flattening method. Our experimental results demonstrate that our IF-conversion method is effective in reducing the number of predicated instructions and in boosting kernel execution speed.", acknowledgement = ack-nhfb, affiliation = "Rotem, N (Reprint Author), Univ Haifa, Dept Comp Sci, IL-31999 Haifa, Israel. Rotem, Nadav; Ben Asher, Yosi, Univ Haifa, Dept Comp Sci, IL-31999 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "rotemn@cs.haifa.ac.il yosi@cs.haifa.ac.il", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "block flattening method; block unification IF-conversion; block unification method; code generation; Code generation; compiler transformation; Compilers; Computer architecture; data-parallel graphic calculations; GPU SIMD architectures; Graphics processing unit; graphics processing units; high performance architectures; Kernel; Merging; multiple control flow paths; parallel processing; Processors; program compilers; Programming Languages; Registers; Software/Software Engineering; vectorizing compilers; Vectors; wide SIMD vector units", number-of-cited-references = "15", research-areas = "Computer Science", times-cited = "1", unique-id = "Rotem:2014:BUI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Ilic:2014:CAR, author = "Aleksandar Ilic and Frederico Pratas and Leonel Sousa", title = "Cache-aware Roofline model: Upgrading the loft", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The Roofline model graphically represents the attainable upper bound performance of a computer architecture. This paper analyzes the original Roofline model and proposes a novel approach to provide a more insightful performance modeling of modern architectures by introducing cache-awareness, thus significantly improving the guidelines for application optimization. 
The proposed model was experimentally verified for different architectures by taking advantage of built-in hardware counters with a curve fitness above 90\%.", acknowledgement = ack-nhfb, affiliation = "Ilic, A (Reprint Author), Univ Tecn Lisboa, INESC ID IST, Lisbon, Portugal. Ilic, Aleksandar; Pratas, Frederico; Sousa, Leonel, Univ Tecn Lisboa, INESC ID IST, Lisbon, Portugal.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ilic@inesc-id.pt fcpp@inesc-id.pt las@inesc-id.pt", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "national funds through FCT (Fundacao para a Ciencia e a Tecnologia) [PTDC/EEI-ELC/3152/2012, PEst-OE/EEI/LA0021/2011, PTDC/EEA-ELC/117329/2010]; FCT [SFRH/BPD/87734/2012]", funding-text = "This work was supported by national funds through FCT (Fundacao para a Ciencia e a Tecnologia), under projects PTDC/EEI-ELC/3152/2012, PEst-OE/EEI/LA0021/2011, and PTDC/EEA-ELC/117329/2010. F. Pratas also acknowledges the FCT scholarship SFRH/BPD/87734/2012.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Application optimization; application optimization; Application optimization; built-in hardware counters; C.0.d Modeling of computer architecture < C.0 General < C Computer Systems Organization; C.0.e System architectures; C.4.d Modeling techniques < C.4 Performance of Systems < C Computer Systems Organization; C.4.g Measurement; cache storage; cache-aware Roofline model; cache-awareness; computer architecture; computer architecture upper bound performance; curve fitness; evaluation; integration and modeling < C.0 General < C Computer Systems Organization; Modeling; modeling; Multicore computer architectures; Multiprocessing systems; multiprocessing systems; Performance evaluation; Performance modeling; Simulation; simulation of multiple-processor systems < C.4 Performance of Systems < C Computer Syst", number-of-cited-references = "10", ORCID-numbers = "Ilic, Aleksandar/0000-0002-8594-3539 Sousa, Leonel/0000-0002-8066-221X", research-areas = "Computer Science", researcherid-numbers = "Ilic, Aleksandar/L-1943-2014 Sousa, Leonel/B-2749-2009", times-cited = "24", unique-id = "Ilic:2014:CAR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Efraim:2014:EAR, author = "Rotem Efraim and Ran Ginosar and C. 
Weiser and Avi Mendelson", title = "Energy Aware Race to Halt: a Down to {EARtH} Approach for Platform Energy Management", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.32", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The EARtH algorithm finds the optimal voltage and frequency operational point of the processor in order to achieve minimum energy of the computing platform. The algorithm is based on a theoretical model employing a small number of parameters, which are extracted from real systems using off-line and run-time methods. The model and algorithm have been validated on real systems using 45nm, 32nm and 22nm Intel (R) Core processors. The algorithm can save up to 44\% energy compared with the commonly used fixed frequency policies.", acknowledgement = ack-nhfb, affiliation = "Efraim, R (Reprint Author), Intel Corp, Santa Clara, CA 95051 USA. Efraim, Rotem, Intel Corp, Santa Clara, CA 95051 USA. Ginosar, Ran; Weiser, C.; Mendelson, Avi, Technion Israeli Inst Technol, Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithm design and analysis; B Hardware; B.9 Power Management; B.9.2 Energy-aware systems; C Computer Systems Organization; C.4 Performance of Systems; C.5 Computer System Implementation; C.5.4 VLSI Systems; C.5.5 Servers; Computational modeling; Earth; EARtH algorithm; energy aware race to halt; Energy management; Energy measurement; fixed frequency policies; Frequency measurement; frequency operational point; Heterogeneous cores; Intel core processors; microprocessor chips; off-line methods; optimal voltage; platform energy management; power aware computing; Power Management; run-time methods; size 22 nm; size 32 nm; size 45 nm; Voltage measurement", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "9", unique-id = "Efraim:2014:EAR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Cakmakci:2014:EVA, author = "Yaman {\c{C}}akmak{\c{c}}i and O{\u{g}}uz Ergin", title = "Exploiting Virtual Addressing for Increasing Reliability", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "A novel method to protect a system against errors resulting from soft errors occurring in the virtual address (VA) storing structures such as translation lookaside buffers (TLB), physical register file (PRF) and the program counter (PC) is proposed in this paper. The work is motivated by showing how soft errors impact the structures that store virtual page numbers (VPN). 
A solution is proposed by employing linear block encoding methods to be used as a virtual addressing scheme at link time. Using the encoding scheme to assign VPNs for VAs, it is shown that the system can tolerate soft errors using software with the help of the discussed decoding techniques applied to the page fault handler. The proposed solution can be used on all of the architectures using virtually indexed addressing. The main contribution of this paper is the decreasing of AVF for data TLB by 42.5\%, instruction TLB by 40.3\%, PC by 69.2\% and PRF by 33.3\%.", acknowledgement = ack-nhfb, affiliation = "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), TOBB Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey. {\c{C}}akmak{\c{c}}i, Yaman; Ergin, O{\u{g}}uz, TOBB Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ycakmakci@etu.edu.tr oergin@etu.edu.tr", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Scientific and Technological Research Council of Turkey (TUBITAK) [112E004]", funding-text = "This work was supported in part by the Scientific and Technological Research Council of Turkey (TUBITAK) under Grant 112E004. The work is in the framework of COST ICT Action 1103 Manufacturable and Dependable Multicore Architectures at Nanoscale.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "AVF; B Hardware; B.3 Memory Structures; B.3.2 Design Styles; B.3.2.h Virtual memory; B.3.4 Reliability, Testing and Fault-Tolerance; buffer storage; decoding techniques; encoding; Fault tolerance; Hardware; linear block encoding methods; Memory management; page fault handler; PC; physical register file; PRF; program counter; soft errors; TLB; translation lookaside buffers; virtual address storing structures; virtual addressing; virtual addressing scheme; Virtual memory; virtual page numbers; virtually indexed addressing; VPN", keywords-plus = "SOFT ERRORS", number-of-cited-references = "10", ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787", research-areas = "Computer Science", researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010", times-cited = "1", unique-id = "Cakmakci:2014:EVA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhu:2014:EWC, author = "Yuhao Zhu and Aditya Srikanth and Jingwen Leng and Vijay Janapa Reddi", title = "Exploiting Webpage Characteristics for Energy-Efficient Mobile {Web} Browsing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "33--36", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.33", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Web browsing on mobile devices is undoubtedly the future. However, with the increasing complexity of webpages, the mobile device's computation capability and energy consumption become major pitfalls for a satisfactory user experience. In this paper, we propose a mechanism to effectively leverage processor frequency scaling in order to balance the performance and energy consumption of mobile web browsing. 
This mechanism explores the performance and energy tradeoff in webpage loading, and schedules webpage loading according to the webpages' characteristics, using the different frequencies. The proposed solution achieves 20.3\% energy saving compared to the performance mode, and improves webpage loading performance by 37.1\% compared to the battery saving mode.", acknowledgement = ack-nhfb, affiliation = "Zhu, YH (Reprint Author), Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA. Zhu, Yuhao; Srikanth, Aditya; Leng, Jingwen; Reddi, Vijay Janapa, Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yzhu@utexas.edu aditya.srik@utexas.edu jingwen@utexas.edu vj@ece.utexas.edu", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "C Computer Systems Organization; C.2 Communication/Networking and Information Technology; C.2.8 Mobile Computing; Cascading style sheets; Cutoff; EDP; Energy; energy conservation; energy consumption; Energy consumption; energy-efficient mobile Web browsing; HTML; Internet; Load modeling; Loading; Market research; Mobile communication; mobile computing; mobile device computation capability; Performance; power aware computing; processor frequency scaling; user experience; Web page characteristics; Web page loading performance; Webpages", number-of-cited-references = "6", research-areas = "Computer Science", times-cited = "6", unique-id = "Zhu:2014:EWC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Morad:2014:GMO, author = "Amir Morad and Tomer Y. 
Morad and Leonid Yavits and Ran Ginosar and Uri Weiser", title = "Generalized {MultiAmdahl}: Optimization of Heterogeneous Multi-Accelerator {SoC}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "37--40", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.34", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Consider a workload comprising a consecutive sequence of program execution segments, where each segment can either be executed on general purpose processor or offloaded to a hardware accelerator. An analytical optimization framework based on MultiAmdahl framework and Lagrange multipliers, for selecting the optimal set of accelerators and for allocating resources among them under constrained area is proposed. Due to the practical implementation of accelerators, the optimal architecture under area constraints may exclude some of the accelerators. As the fraction of the workload that can be accelerated decreases, resources (e.g. area) may shift from accelerators into the general purpose processor. The framework can be extended in a number of ways, spanning from SoC partitioning, bandwidth to power distribution, energy and other constrained resources.", acknowledgement = ack-nhfb, affiliation = "Morad, A (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Morad, Amir; Morad, Tomer Y.; Yavits, Leonid; Ginosar, Ran; Weiser, Uri, Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "amirm@tx.technion.ac.il tomerm@tx.technion.ac.il yavits@tx.technion.ac.il ran@ee.technion.ac.il uri.weiser@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; analytical optimization framework; Chip Multiprocessors; general purpose processor; generalized multiAmdhal framework; Hardware; hardware accelerator; heterogeneous multiaccelerator SoC partitioning; Lagrange multiplier; Mathematical model; Modeling of computer architecture; MultiAmdahl; Multicore processing; optimisation; Optimization; power distribution bandwidth; program execution segment; resource allocation; Resource management; System-on-a-chip; system-on-chip", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "4", unique-id = "Morad:2014:GMO", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kvatinsky:2014:MBM, author = "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion and Eby G. Friedman and Avinoam Kolodny and Uri C. Weiser", title = "Memristor-Based Multithreading", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "41--44", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Switch on Event Multithreading (SoE MT, also known as coarse-grained MT and block MT) processors run multiple threads on a pipeline machine, while the pipeline switches threads on stall events (e.g., cache miss). 
The thread switch penalty is determined by the number of stages in the pipeline that are flushed of in-flight instructions. In this paper, Continuous Flow Multithreading (CFMT), a new architecture of SoE MT, is introduced. In CFMT, a multistate pipeline register (MPR) holds the microarchitectural state of multiple different threads within the execution pipeline stages, where only one thread is active at a time. The MPRs eliminate the need to flush in-flight instructions and therefore significantly improve performance. In recent years, novel memory technologies such as Resistive RAM (RRAM) and Spin Torque Transfer Magnetoresistive RAM (STT-MRAM), have been developed. All of these technologies are nonvolatile, store data as resistance, and can be described as ``memristors''. Memristors are power efficient, dense, and fast as compared to standard memory technologies such as SRAM, DRAM, and Flash. Memristors therefore provide the opportunity to place the MPRs physically within the pipeline stages. A performance analysis of CFMT is compared to conventional SoE MT processors, demonstrating up to a 2X performance improvement, while the operational mechanism, due to the use of memristors, is low power and low complexity as compared to conventional SoE MT processors.", acknowledgement = ack-nhfb, affiliation = "Kvatinsky, S (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam; Weiser, Uri C., Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav, Technion Israel Inst Technol, Dept Comp Sci, IL-32000 Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept Elect \& Comp Engn, Rochester, NY 14627 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "skva@tx.technion.ac.il", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Hasso Plattner Institute", funding-text = "This work was supported by the Hasso Plattner Institute. The authors thank Ravi Patel for his comments and area overhead estimation and to Nimrod Wald and Guy Satat for their help in evaluating the architecture.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B Hardware; B.3 Memory Structures; B.7 Integrated Circuits; B.7.1 Types and Design Styles; B.7.1.e Memory technologies; C Computer Systems Organization; C.0 General; C.0.a Emerging technologies; C.0.d Modeling of computer architecture; CFMT; Computer architecture; continuous flow multithreading; in-flight instructions; Integrated circuits; Memory management; memristor; memristor-based multithreading; memristors; MPR; multi-threading; multistate pipeline register; multithreaded processors; Multithreading; novel memory technologies; phase change memory; random-access storage; resistive RAM; RRAM; RRAM, STT-MRAM; SoE MT processors; spin torque transfer magnetoresistive RAM; STT- MRAM; STT-MRAM; switch on event multithreading processors; Systems design and analysis", keywords-plus = "RESISTIVE SWITCHING MEMORIES", number-of-cited-references = "21", research-areas = "Computer Science", times-cited = "10", unique-id = "Kvatinsky:2014:MBM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wingbermuehle:2014:OAS, author = "Joseph G. Wingbermuehle and Ron K. Cytron and Roger D. 
Chamberlain", title = "Optimization of Application-Specific Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "45--48", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory access times are the primary bottleneck for many applications today. This ``memory wall'' is due to the performance disparity between processor cores and main memory. To address the performance gap, we propose the use of custom memory subsystems tailored to the application rather than attempting to optimize the application for a fixed memory subsystem. Custom subsystems can take advantage of application-specific properties as well as memory-specific properties to improve access times or write-backs given constraints on size or power.", acknowledgement = ack-nhfb, affiliation = "Wingbermuehle, JG (Reprint Author), Washington Univ, Dept Comp Sci \& Engn, St Louis, MO 63130 USA. Wingbermuehle, Joseph G.; Cytron, Ron K.; Chamberlain, Roger D., Washington Univ, Dept Comp Sci \& Engn, St Louis, MO 63130 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "wingbej@wustl.edu cytron@wustl.edu roger@wustl.edu", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CNS-09095368, CNS-0931693]", funding-text = "This work is supported by the National Science Foundation under grants CNS-09095368 and CNS-0931693.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "access time improvement; application-specific memory optimization; B Hardware; B.3 Memory Structures; B.3.2 Design Styles; B.3.3 Performance Analysis and Design Aids; B.3.3.b Simulation; C Computer Systems Organization; C.1 Processor Architectures; C.1.5 Micro-architecture implementation considerations; C.1.5.e Memory hierarchy; cache; cache storage; Computer architecture; custom memory subsystems; fixed memory subsystem; Hardware; memory access times; Memory management; memory wall; memory-specific properties; Multiprocessing systems; performance disparity; Performance evaluation; performance gap; processor cores; write-backs given constraints", number-of-cited-references = "21", research-areas = "Computer Science", times-cited = "1", unique-id = "Wingbermuehle:2014:OAS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Xu:2014:STM, author = "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao Li and Depei Qian", title = "Software Transactional Memory for {GPU} Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "49--52", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To make applications with dynamic data sharing among threads benefit from GPU acceleration, we propose a novel software transactional memory system for GPU architectures (GPU-STM). The major challenges include ensuring good scalability with respect to the massively multithreading of GPUs, and preventing livelocks caused by the SIMT execution paradigm of GPUs. 
To this end, we propose (1) a hierarchical validation technique and (2) an encounter-time lock-sorting mechanism to deal with the two challenges, respectively. Evaluation shows that GPU-STM outperforms coarse-grain locks on GPUs by up to 20x.", acknowledgement = ack-nhfb, affiliation = "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch Elect \& Informat Engn, Xian 710049, Peoples R China. Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch Elect \& Informat Engn, Xian 710049, Peoples R China. Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp Sci, Beijing, Peoples R China. Goswami, Nilanjan; Li, Tao, Univ Florida, ECE Dept, Gainesville, FL USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF of China [61133004, 61128004, 61073011]; 863 Program of China [2012AA010902]", funding-text = "This work is supported by NSF of China under grant 61133004, 61128004 and 61073011, and 863 Program of China under grant 2012AA010902.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "dynamic data sharing; encounter-time lock-sorting mechanism; GPU acceleration; GPU architectures; GPU-STM; graphics processing units; hierarchical validation technique; multi-threading; Multicore processing; multicore processor; Multicore Processors; multiprocessing systems; Multiprocessing systems; multithreading; parallel architectures; Parallel processing; Parallel Programming; parallel programming; Parallel Programming; Run-time Environments; Runtime environment; SIMD processor; SIMD Processors; SIMT execution paradigm; software transactional memory system; sorting", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "1", unique-id = "Xu:2014:STM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Shim:2014:TMP, author = "Keun Sup Shim and Mieszko Lis and Omer Khan and Srinivas Devadas", title = "Thread Migration Prediction for Distributed Shared Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "53--56", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.30", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Chip-multiprocessors (CMPs) have become the mainstream parallel architecture in recent years; for scalability reasons, designs with high core counts tend towards tiled CMPs with physically distributed shared caches. This naturally leads to a Non-Uniform Cache Access (NUCA) design, where on-chip access latencies depend on the physical distances between requesting cores and home cores where the data is cached. Improving data locality is thus key to performance, and several studies have addressed this problem using data replication and data migration. 
In this paper, we consider another mechanism, hardware-level thread migration. This approach, we argue, can better exploit shared data locality for NUCA designs by effectively replacing multiple round-trip remote cache accesses with a smaller number of migrations. High migration costs, however, make it crucial to use thread migrations judiciously; we therefore propose a novel, on-line prediction scheme which decides whether to perform a remote access (as in traditional NUCA designs) or to perform a thread migration at the instruction level. For a set of parallel benchmarks, our thread migration predictor improves the performance by 24\% on average over the shared-NUCA design that only uses remote accesses.", acknowledgement = ack-nhfb, affiliation = "Shim, KS (Reprint Author), MIT, 77 Massachusetts Ave, Cambridge, MA 02139 USA. Shim, Keun Sup; Lis, Mieszko; Devadas, Srinivas, MIT, Cambridge, MA 02139 USA. Khan, Omer, Univ Connecticut, Storrs, CT USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design Styles; B.3.2.g Shared memory; Benchmark testing; C Computer Systems Organization; C.1 Processor Architectures; C.1.4 Parallel Architectures; Cache Coherence; cache storage; chip-multiprocessors; CMPs; Coherence; Computer architecture; Context; core counts; Data Locality; data locality improvement; data migration; data replication; Distributed Caches; hardware-level thread migration prediction; home cores; Instruction sets; integrated circuit design; mainstream parallel architecture; microprocessor chips; multiprocessing systems; nonuniform cache access design; on-chip access latencies; online prediction scheme; Parallel Architecture; parallel architectures; physical distributed shared caches; Protocols; Registers; requesting cores; shared-NUCA design", number-of-cited-references = "13", oa = "Green Published", research-areas = "Computer Science", times-cited = "4", unique-id = "Shim:2014:TMP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2014:TCa, author = "Anonymous", title = "Table of Contents", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "C1--C4", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2360655", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2014:ITPa, author = "Anonymous", title = "{{\booktitle{IEEE Transactions on Pattern Analysis and Machine Intelligence}} Editorial Board}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "C2--C2", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2360656", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2014:ITPb, author = "Anonymous", title = "{{\booktitle{IEEE Transactions on Pattern Analysis and Machine Intelligence}}} Information for Authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "C3--C3", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2360657", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2014:ICSa, author = "Anonymous", title = "{IEEE Computer Society}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "C4--C4", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2360658", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lavasani:2014:FBL, author = "Maysam Lavasani and Hari Angepat and Derek Chiou", title = "An {FPGA}-based In-Line Accelerator for {Memcached}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "57--60", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.17", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We present a method for accelerating server applications using a hybrid CPU+FPGA architecture and demonstrate its advantages by accelerating Memcached, a distributed key-value system. The accelerator, implemented on the FPGA fabric, processes request packets directly from the network, avoiding the CPU in most cases. The accelerator is created by profiling the application to determine the most commonly executed trace of basic blocks which are then extracted. Traces are executed speculatively within the FPGA. 
If the control flow exits the trace prematurely, the side effects of the computation are rolled back and the request packet is passed to the CPU. When compared to the best reported software numbers, the Memcached accelerator is 9.15x more energy efficient for common case requests.", acknowledgement = ack-nhfb, affiliation = "Lavasani, M (Reprint Author), Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA. Lavasani, Maysam; Angepat, Hari; Chiou, Derek, Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "maysamlavasani@utexas.edu hangepat@utexas.edu derek@utexas.edu", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerating server; C.1.3.f Heterogeneous (hybrid) systems; C.2.4.a Client/server; cache storage; Client-server systems; Computer architecture; control flow; distributed key-value system; distributed processing; field programmable gate arrays; Field programmable gate arrays; FPGA-based in-line accelerator; hybrid CPU+FPGA architecture; Hybrid systems; Memcached accelerator; Program processors; reconfigurable architectures; request packet; rolled back; software numbers", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "24", unique-id = "Lavasani:2014:FBL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Song:2014:AFB, author = "Xiang Song and Jian Yang and Haibo Chen", title = "Architecting Flash-based Solid-State Drive for High-performance {I/O} Virtualization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "61--64", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.22", ISSN = "1556-6056 (print), 1556-6064 
(electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Flash-based solid-state drive (SSD) is now being widely deployed in cloud computing platforms due to the potential advantages of better performance and less energy consumption. However, current virtualization architecture lacks support for high-performance I/O virtualization over persistent storage, which results in sub-optimal I/O performance for guest virtual machines (VMs) on SSD. Further, current software-based I/O virtualization violates the ``don't hide power'' principle due to inefficient support for some advanced SSD commands (e.g., TRIM) and constrained parallelism, leading to sub-optimal performance and life cycle. This paper observes that the massive internal parallelism and the block emulation in the flash translation layer (FTL) make flash-based SSD an ideal candidate to support high-performance I/O virtualization for persistent storage. Based on this observation, we propose VFlash, the first storage I/O virtualization architecture that extends existing SSDs with trivial hardware changes to directly expose multiple virtual SSDs to guest VMs. Performance evaluation using a modified FlashSim with two FTL schemes (i.e., DFTL and FAST) shows that VFlash incurs only small performance overhead over native SSDs and can efficiently exploit parallelism.", acknowledgement = ack-nhfb, affiliation = "Chen, HB (Reprint Author), Shanghai Jiao Tong Univ, Sch Software, Inst Parallel \& Distributed Syst, Shanghai 200030, Peoples R China. Song, Xiang; Yang, Jian; Chen, Haibo, Shanghai Jiao Tong Univ, Sch Software, Inst Parallel \& Distributed Syst, Shanghai 200030, Peoples R China.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "haibochen@sjtu.edu.cn", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "China National Natural Science Foundation [61003002]; Intel", funding-text = "This work was supported by China National Natural Science Foundation under grant numbered 61003002 and a grant from Intel.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B.4.4 Performance Analysis and Design Aids; C.4.g Measurement; cloud computing; Cloud computing; cloud computing platforms; Computer architecture; energy consumption; evaluation; flash memories; flash-based solid-state drive; high performance I/O virtualization architecture; I/O virtualization; modeling; Multiprocessing systems; Parallel processing; Performance evaluation; performance evaluation; Random access memory; simulation of multiple-processor systems; software-based I/O virtualization; Solid state circuits; Solid State Drive; SSD commands; virtual machines; virtualisation; VM", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "6", unique-id = "Song:2014:AFB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wu:2014:ATE, author = "Carole-Jean Wu", title = "Architectural Thermal Energy Harvesting Opportunities for Sustainable Computing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "65--68", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.16", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Increased power dissipation in computing devices has led to a sharp rise in thermal hotspots, creating thermal runaway. 
To reduce the additional power requirement caused by increased temperature, current approaches apply cooling mechanisms to remove heat or apply management techniques to avoid thermal emergencies by slowing down heat generation. This paper proposes to tackle the heat management problem of computing platforms with a fundamentally new approach - instead of heat removal using cooling mechanisms and heat avoidance using dynamic thermal/power management techniques, this work investigates the mechanisms to recover wasted heat into reusable energy for sustainable computing. Through recent advancements in thermoelectric materials, we allow wasted heat energy generated by computing devices to be recovered, transformed, and harvested as electricity that can be directly used within the system. We demonstrate a real-system setup where we recover 0.3 to 1 watt of power with the CPU running at 70 to 105 degrees C, using a COTS thermoelectric device on top of the CPU. Through this research, we hope to motivate more in-depth efforts to explore heat energy harvesting opportunities on computing devices and inspire plausible solutions to overcome the technical challenges discussed in this paper.", acknowledgement = ack-nhfb, affiliation = "Wu, CJ (Reprint Author), Arizona State Univ, Sch Comp, Dept Comp Sci Engn, Tempe, AZ 85281 USA. Arizona State Univ, Sch Comp, Dept Comp Sci Engn, Tempe, AZ 85281 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "carole-jean.wu@asu.edu", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architectural thermal energy harvesting; cooling; Cooling; cooling mechanisms; dynamic thermal-power management technique; Energy conservation; energy harvesting; Energy-aware systems; heat generation; heat management problem; power dissipation; Power distribution; power engineering computing; Resistance heating; sustainable computing; Temperature measurement; Temperature-aware design; thermal energy storage; thermal runaway; Waste heat", number-of-cited-references = "6", research-areas = "Computer Science", times-cited = "5", unique-id = "Wu:2014:ATE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yavits:2014:CHO, author = "Leonid Yavits and Amir Morad and Ran Ginosar", title = "Cache Hierarchy Optimization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "69--72", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.18", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Power consumption, off-chip memory bandwidth, chip area and Network on Chip (NoC) capacity are among main chip resources limiting the scalability of Chip Multiprocessors (CMP). A closed form analytical solution for optimizing the CMP cache hierarchy and optimally allocating area among hierarchy levels under such constrained resources is developed. The optimization framework is extended by incorporating the impact of data sharing on cache miss rate. An analytical model for cache access time as a function of cache size is proposed and verified using CACTI simulation.", acknowledgement = ack-nhfb, affiliation = "Yavits, L (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. 
Yavits, Leonid; Morad, Amir; Ginosar, Ran, Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "ICRI-CI; Hasso-Plattner-Institut", funding-text = "We thank Prof. Uri Weiser and Yaniv Ben Itzhak for their review and remarks. This research was partially funded by the ICRI-CI and Hasso-Plattner-Institut.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; Analytical Performance Models; Bandwidth; Cache Hierarchy; cache hierarchy optimization; cache storage; CACTI simulation; chip area; Chip Multiprocessor; chip multiprocessors; CMP; Computational modeling; data sharing; Integrated circuit modeling; Multiprocessing systems; network on chip; network-on-chip; NoC; off-chip memory bandwidth; optimisation; Optimization; power consumption; Resource Allocation Optimization; Resource Allocation Optimizations; Resource management", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "1", unique-id = "Yavits:2014:CHO", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yazdanshenas:2014:CLL, author = "Sadegh Yazdanshenas and Marzieh Ranjbar Pirbasti and Mahdi Fazeli and Ahmad Patooghy", title = "Coding Last Level {STT-RAM} Cache For High Endurance And Low Power", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "73--76", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "STT-RAM technology has recently emerged as one of the 
most promising memory technologies. However, its major problems, limited write endurance and high write energy, are still preventing it from being used as a drop-in replacement of SRAM cache. In this paper, we propose a novel coding scheme for STT-RAM last level cache based on the concept of value locality. We reduce switching probability in cache by swapping common patterns with limited weight codes (LWC) to make writes less often as well as more uniform. We also define some policies for swapping these patterns. Our evaluation shows that bit write variance in memory cells can be reduced by about 20\% on average resulting in a more uniform wear-out directly enhancing lifetime and improving cell reliability. In addition, writes in cache lines can be reduced by about 12\% compared to one of the most effective circuit level techniques known as early write termination (EWT) [12]. Our method increases memory hierarchy access time by about 0.08\% on average, which is negligible. We have shown that our method doesn't adversely affect last level cache energy-delay(2). The non-uniformity caused by the coding scheme can be used for another coding scheme at main memory or L1 cache depending on their technologies.", acknowledgement = ack-nhfb, affiliation = "Yazdanshenas, S (Reprint Author), Iran Univ Sci \& Technol, Sch Comp Engn, Tehran, Iran. Yazdanshenas, Sadegh; Pirbasti, Marzieh Ranjbar; Fazeli, Mahdi; Patooghy, Ahmad, Iran Univ Sci \& Technol, Sch Comp Engn, Tehran, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sadegh\_yazdanshenas@comp.iust.ac.ir m\_ranjbar@comp.iust.ac.ir m\_fazeli@iust.ac.ir patooghy@iust.ac.ir", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B Hardware; B.3 Memory Structures; bit write variance; C Computer Systems Organization; C.1 Processor Architectures; cache; cache storage; cell reliability; circuit level technique; coding scheme; Computer architecture; early write termination; Encoding; limited weight code; limited weight codes; memory endurance; memory technology; nonvolatile memory; Nonvolatile memory; probability; Random access memory; random-access storage; STT-RAM; STT-RAM cache; switching probability; Three-dimensional displays; write energy; write hotspot", keywords-plus = "MEMORY; CIRCUIT; ENERGY; MRAM", number-of-cited-references = "13", ORCID-numbers = "Fazeli, Mahdi/0000-0002-2874-6256 Patooghy, Ahmad/0000-0003-2647-2797", research-areas = "Computer Science", researcherid-numbers = "Fazeli/S-9574-2018", times-cited = "14", unique-id = "Yazdanshenas:2014:CLL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Martinsen:2014:HTL, author = "Jan Kasper Martinsen and Hakan Grahn and Anders Isberg", title = "Heuristics for Thread-Level Speculation in {Web} Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "77--80", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.26", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib", abstract = "JavaScript is a sequential programming language, and Thread-Level Speculation has been proposed to dynamically extract parallelism in order to take advantage of parallel hardware. In previous work, we have showed significant speed-ups with a simple on/off speculation heuristic. 
In this paper, we propose and evaluate three heuristics for dynamically adapt the speculation: a 2-bit heuristic, an exponential heuristic, and a combination of these two. Our results show that the combined heuristic is able to both increase the number of successful speculations and decrease the execution time for 15 popular web applications.", acknowledgement = ack-nhfb, affiliation = "Martinsen, JK (Reprint Author), Blekinge Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden. Isberg, Anders, Sony Mobile Commun AB, SE-22188 Lund, Sweden.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se Anders.Isberg@sonymobile.com", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Industrial Excellence Center EASE - Embedded Applications Software Engineering; BESQ+ research project --- Knowledge Foundation in Sweden [20100311]", funding-text = "This work was partly funded by the Industrial Excellence Center EASE --- Embedded Applications Software Engineering, (http://ease.cs.lth.se), and the BESQ+ research project funded by the Knowledge Foundation (grant number 20100311) in Sweden.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "2-bit heuristic; Automatic Parallelization; Benchmark testing; C.1.4 Parallel Architectures; C.1.4.f Speculative multi-threading; exponential heuristic; Instruction sets; Internet; Java; JavaScript; Multicore processors; Multithreading; Parallel Computing; parallel hardware; Parallel processing; parallel programming; sequential programming language; Social network services; thread-level speculation; Web applications", number-of-cited-references = "12", oa = "Green Published", ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn, Hakan/0000-0001-9947-1088", research-areas = "Computer Science", times-cited = "2", unique-id = "Martinsen:2014:HTL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Nandakumar:2014:OKS, author = "Vivek S. Nandakumar and Ma{\l}gorzata Marek-Sadowska", title = "On Optimal Kernel Size for Integrated {CPU--GPUs} --- a Case Study", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "81--84", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.27", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Integrated CPU-GPU architectures with a fully addressable shared memory completely eliminate any CPU-GPU data transfer overhead. Since such architectures are relatively new, it is unclear what level of interaction between the CPU and GPU attains the best energy efficiency. Too coarse grained or larger kernels with fairly low CPU--GPU interaction could cause poor utilization of the shared resources while too fine grained kernels could cause frequent interrupts of GPU computation and performance degradation. 
Also larger kernels require larger shared resources causing increase in area and parasitics which affect the latency sensitive CPU cores. In this paper, we show the effect of granularity on the overall system's energy efficiency using a synthetic workload. We describe how our framework models a truly unified shared memory in integrated architectures with frequent CPU--GPU communication.", acknowledgement = ack-nhfb, affiliation = "Nandakumar, VS (Reprint Author), Univ Calif Santa Barbara, Dept Elect \& Comp Engn, Santa Barbara, CA 93106 USA. Nandakumar, Vivek S.; Marek-Sadowska, Malgorzata, Univ Calif Santa Barbara, Dept Elect \& Comp Engn, Santa Barbara, CA 93106 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "vivek@ece.ucsb.edu mms@ece.ucsb.edu", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "SRC grant [2236]", funding-text = "This work was supported by SRC grant \#2236.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B.3.2.g Shared memory; B.4.4.b Simulation; B.9.2 Energy-aware systems; C.1.3.f Heterogeneous (hybrid) systems; C.4.g Measurement; Central Processing Unit; Computational modeling; CPU-GPU communication; CPU-GPU data transfer overhead; CPU-GPU interaction; D.4.4 Communications Management; energy efficiency; Energy efficiency; evaluation; fine grained kernels; fully addressable shared memory; GPU computation; graphics processing units; Graphics processing units; integrated CPU-GPU architectures; latency sensitive CPU cores; Memory management; modeling; optimal kernel size; overall system energy efficiency; performance degradation; performance evaluation; power aware computing; shared memory systems; simulation of multiple-processor systems", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Nandakumar:2014:OKS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Liu:2014:PTE, author = "Qixiao Liu and Victor Jimenez and Miquel Moreto and Jaume Abella and Francisco J. Cazorla and Mateo Valero", title = "Per-task Energy Accounting in Computing Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "85--88", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.24", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We present for the first time the concept of per-task energy accounting (PTEA) and relate it to per-task energy metering (PTEM). We show the benefits of supporting both in future computing systems. 
Using the shared last-level cache (LLC) as an example: (1) We illustrate the complexities in providing PTEM and PTEA; (2) we present an idealized PTEM model and an accurate and low-cost implementation of it; and (3) we introduce a hardware mechanism to provide accurate PTEA in the cache.", acknowledgement = ack-nhfb, affiliation = "Liu, QX (Reprint Author), Univ Politecn Cataluna, E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor; Moreto, Miquel; Valero, Mateo, Univ Politecn Cataluna, E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor; Moreto, Miquel; Abella, Jaume; Cazorla, Francisco J.; Valero, Mateo, Barcelona Supercomp Ctr, Barcelona, Spain. Cazorla, Francisco J., Spanish Natl Res Council IIIA CSIC, Barcelona, Spain.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish Ministry of Science and Innovation [TIN2012-34557]; HiPEAC Network of Excellence; Chinese Scholarship Council [2010608015]", funding-text = "This work has been partially supported by the Spanish Ministry of Science and Innovation under grant TIN2012-34557 and the HiPEAC Network of Excellence. Qixiao Liu has also been funded by the Chinese Scholarship Council under grant 2010608015.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; cache storage; Computational modeling; computing systems; Energy consumption; Energy management; Monitoring; Multicore processing; per-task energy accounting; per-task energy metering; power aware computing; PTEA; PTEM model; Radiation detectors; shared last-level cache", number-of-cited-references = "20", oa = "Green Published", ORCID-numbers = "Cazorla, Francisco/0000-0002-3344-376X Moreto Planas, Miquel/0000-0002-9848-8758 Valero, Mateo/0000-0003-2917-2482 Abella, Jaume/0000-0001-7951-4028 Liu, Qixiao/0000-0002-8196-7584", research-areas = "Computer Science", researcherid-numbers = "Cazorla, Francisco/D-7261-2016 Moreto Planas, Miquel/C-1823-2016 Valero, Mateo/L-5709-2014 Abella, Jaume/B-7422-2016", times-cited = "2", unique-id = "Liu:2014:PTE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Mahmoodi:2014:RCC, author = "Hamid Mahmoodi and Sridevi Srinivasan Lakshmipuram and Manish Arora and Yashar Asgarieh and Houman Homayoun and Bill Lin and Dean M. Tullsen", title = "Resistive Computation: a Critique", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "89--92", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.23", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Resistive Computation was suggested by [6] as an idea for tacking the power wall by replacing conventional CMOS logic with Magnetic Tunnel Junction (MTJ) based Look-Up Tables (LUTs). Spin Transfer Torque RAM (STTRAM) is an emerging CMOS-compatible non-volatile memory technology based on Magnetic Tunnel Junctions as a memory bit [3]. 
The principal advantage of STTRAM is that it is leakage-resistant, which is an important characteristic beyond the 45nm technology node, where leakage concerns are becoming a limiting factor in microprocessor performance. Although STTRAM is a good candidate for replacing SRAM for on-chip memory, we argue in this article MTJ-based LUTs are unnecessarily expensive in terms of area, power, and performance when implementing fixed combinational logic that does not require the reprogramming ability provided by MTJs.", acknowledgement = ack-nhfb, affiliation = "Mahmoodi, H (Reprint Author), San Francisco State Univ, San Francisco, CA 94132 USA. Arora, Manish; Asgarieh, Yashar; Lin, Bill; Tullsen, Dean M., Univ Calif San Diego, La Jolla, CA 92093 USA. Mahmoodi, Hamid; Lakshmipuram, Sridevi Srinivasan, San Francisco State Univ, San Francisco, CA 94132 USA. Homayoun, Houman, George Mason Univ, Fairfax, VA 22030 USA.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "B.2.1 Design Styles; B.6.1.e Memory used as logic; B.7.1.a Advanced technologies; B.9.1 Low-power design; C.0.a Emerging technologies; CMOS integrated circuits; CMOS-compatible nonvolatile memory technology; Delays; dynamic current-mode logic; fixed combinational logic; leakage power; leakage-resistance; Logic gates; look-up tables; Low power electronics; magnetic tunnel junction; Magnetic tunneling; magnetic tunnelling; magnetic-tunnel junctions; memory bit; MRAM; MTJ-based LUT; Power distribution; random-access storage; Resistive computation; resistive computation; spin transfer torque RAM; STTRAM; Table lookup; table lookup; Transistors", keywords-plus = "TECHNOLOGY; CIRCUIT", number-of-cited-references = "10", ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X", research-areas = "Computer Science", researcherid-numbers = "Lin, Binshan/A-9772-2009", times-cited = "4", unique-id = "Mahmoodi:2014:RCC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Eyerman:2014:RCW, author = "Stijn Eyerman and Lieven Eeckhout", title = "Restating the Case for Weighted-{IPC} Metrics to Evaluate Multiprogram Workload Performance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "93--96", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Weighted speedup is nowadays the most commonly used multiprogram workload performance metric. Weighted speedup is a weighted-IPC metric, i.e., the multiprogram IPC of each program is first weighted with its isolated IPC. 
Recently, Michaud questions the validity of weighted-IPC metrics by arguing that they are inconsistent and that weighted speedup favors unfairness [4]. Instead, he advocates using the arithmetic or harmonic mean of the raw IPC values of the programs in the multiprogram workload. We show that weighted-IPC metrics are not inconsistent, and that weighted speedup is fair in giving equal importance to each program. We argue that, in contrast to raw-IPC metrics, weighted-IPC metrics have a system-level meaning, and that raw-IPC metrics are affected by the inherent behavior of the programs. We also show that the choice of a metric may adversely affect the conclusions from an experiment. We suggest to use two weighted-IPC metrics-system throughput (STP) and average normalized turnaround time (ANTT)-for evaluating multiprogram workload performance, and to avoid raw-IPC metrics.", acknowledgement = ack-nhfb, affiliation = "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent, Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent, B-9000 Ghent, Belgium.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Research Foundation --- Flanders (FWO); European Research Council under the European Community [259295]", funding-text = "Stijn Eyerman is supported through a postdoctoral fellowship by the Research Foundation --- Flanders (FWO). Additional support is provided by the European Research Council under the European Community's Seventh Framework Programme (FP7/2007-2013) / ERC Grant agreement no. 259295.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ANTT; average normalized turnaround time; Benchmark testing; C Computer Systems Organization; C.1 Processor Architectures; C.1.3 Other Architecture Styles; C.1.3.h Multithreaded processors; C.1.4 Parallel Architectures; C.1.4.e Multi-core/single-chip multiprocessors; C.4 Performance of Systems; C.4.c Measurement techniques; Degradation; Harmonic analysis; harmonic mean; Multicore processing; multiprocessing systems; multiprogram IPC; multiprogram workload performance metric; multiprogramming; raw-IPC metrics; STP; system throughput; system-level meaning; Throughput; Weight measurement; weighted speedup; weighted-IPC metric", number-of-cited-references = "6", research-areas = "Computer Science", times-cited = "9", unique-id = "Eyerman:2014:RCW", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wolff:2014:RUR, author = "Sonya R. Wolff and Ronald D. Barnes", title = "Revisiting Using the Results of Pre-Executed Instructions in Runahead Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "97--100", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.21", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Long-latency cache accesses cause significant performance-impacting delays for both in-order and out-of-order processor systems. To address these delays, runahead pre-execution has been shown to produce speedups by warming-up cache structures during stalls caused by long-latency memory accesses. While improving cache related performance, basic runahead approaches do not otherwise utilize results from accurately pre-executed instructions during normal operation. 
This simple model of execution is potentially inefficient and performance constraining. However, a previous study showed that exploiting the results of accurately pre-executed runahead instructions for out-of-order processors provide little performance improvement over simple re-execution. This work will show that, unlike out-of-order runahead architectures, the performance improvement from runahead result use for an in-order pipeline is more significant, on average, and in some situations provides dramatic performance improvements. For a set of SPEC CPU2006 benchmarks which experience performance improvement from basic runahead, the addition of result use to the pipeline provided an additional speedup of 1.14X (high --- 1.48X) for an in-order processor model compared to only 1.05X (high --- 1.16X) for an out-of-order one. When considering benchmarks with poor data cache locality, the average speedup increased to 1.21X for in-order compared to only 1.10X for out-of-order.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; C.1.5.c Superscalar dynamically-scheduled and statically-scheduled implementation; C.1.5.e Memory hierarchy; cache storage; data cache locality; Hidden Markov models; in-order processor systems; long-latency cache accesses; long-latency memory accesses; Memory Wall; multiprocessing systems; Out of order; out-of-order processor systems; out-of-order runahead architectures; Pipeline processing; Pre-Execution; preexecuted runahead instructions; Registers; Runahead; runahead processors; SPEC CPU2006 benchmarks", keywords-plus = "PIPELINES", number-of-cited-references = "20", research-areas = "Computer Science", times-cited = "0", unique-id = "Wolff:2014:RUR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2014:SGA, author = "Youngsok Kim and Jaewon Lee and Donggyu Kim and Jangwoo Kim", title = "{ScaleGPU}: {GPU} Architecture for Memory-Unaware {GPU} Programming", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "101--104", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.19", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Programmer-managed GPU memory is a major challenge in writing GPU applications. Programmers must rewrite and optimize an existing code for a different GPU memory size for both portability and performance. Alternatively, they can achieve only portability by disabling GPU memory at the cost of significant performance degradation. In this paper, we propose ScaleGPU, a novel GPU architecture to enable high-performance memory-unaware GPU programming. ScaleGPU uses GPU memory as a cache of CPU memory to provide programmers a view of CPU memory-sized programming space. 
ScaleGPU also achieves high performance by minimizing the amount of CPU-GPU data transfers and by utilizing the GPU memory's high bandwidth. Our experiments show that ScaleGPU can run a GPU application on any GPU memory size and also improves performance significantly. For example, ScaleGPU improves the performance of the hotspot application by similar to 48\% using the same size of GPU memory and reduces its memory size requirement by similar to 75\% maintaining the target performance.", acknowledgement = ack-nhfb, affiliation = "Kim, Y (Reprint Author), POSTECH, Dept Comp Sci \& Engn, Pohang, South Korea. Kim, Youngsok; Lee, Jaewon; Kim, Donggyu; Kim, Jangwoo, POSTECH, Dept Comp Sci \& Engn, Pohang, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "elixir@postech.ac.kr spiegel0@postech.ac.kr vteori@postech.ac.kr jangwoo@postech.ac.kr", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Basic Science Research Program through the National Research Foundation of Korea (NRF) --- Ministry of Education, Science and Technology [2011-0014817]; NRF Grant --- Korean Government (NRF-Global Ph.D. Fellowship Program)", funding-text = "This research was supported by Basic Science Research Program through the National Research Foundation of Korea (NRF) funded by the Ministry of Education, Science and Technology (2011-0014817) and NRF Grant funded by the Korean Government (NRF-2012-Global Ph.D. Fellowship Program).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "C.1.2.j SIMD processors; C.1.4.e Multi-core/single-chip multiprocessors; C.1.5.e Memory hierarchy; cache; cache storage; code rewrite; CPU memory-sized programming space; CPU-GPU data transfers; Data transfer; GPU applications; GPU architecture; GPU memory high bandwidth; GPU memory size; graphics processing units; Graphics processing units; graphics processing units; high-performance memory-unaware GPU programming; I.3.1.a Graphics processors; Instruction sets; memory architecture; Memory management; memory size requirement; programmer-managed GPU memory; Programming; Random access memory; ScaleGPU", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "7", unique-id = "Kim:2014:SGA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Sankar:2014:SFL, author = "Sriram Sankar and Sudhanva Gurumurthi", title = "Soft Failures in Large Datacenters", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "105--108", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.25", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "A major problem in managing large-scale datacenters is diagnosing and fixing machine failures. Most large datacenter deployments have a management infrastructure that can help diagnose failure causes, and manage assets that were fixed as part of the repair process. Previous studies identify only actual hardware replacements to calculate Annualized Failure Rate (AFR) and component reliability. 
In this paper, we show that service availability is significantly affected by soft failures and that this class of failures is becoming an important issue at large datacenters with minimum human intervention. Soft failures in the datacenter do not require actual hardware replacements, but still result in service downtime, and are equally important because they disrupt normal service operation. We show failure trends observed in a large datacenter deployment of commodity servers and motivate the need to modify conventional datacenter designs to help reduce soft failures and increase service availability.", acknowledgement = ack-nhfb, affiliation = "Sankar, S (Reprint Author), Microsoft Corp, Redmond, WA 98052 USA. Sankar, Sriram, Microsoft Corp, Redmond, WA 98052 USA. Sankar, Sriram; Gurumurthi, Sudhanva, Univ Virginia, Charlottesville, VA 22903 USA. Gurumurthi, Sudhanva, Adv Micro Devices Inc, AMD Res, Sunnyvale, CA 94088 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sriram.sankar@microsoft.com Sudhanva.Gurumurthi@amd.com", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "AFR; annualized failure rate; asset management; C.4 Performance of Systems; C.5.5 Servers; Characterization; Client-server systems; commodity servers; component reliability; computer centres; Data centers; Datacenter; datacenter deployments; datacenter designs; datacenter management; failure cause diagnosis; fault diagnosis; Hard disks; hardware replacements; Large-scale systems; machine failure diagnosis; machine failure fixing; Maintenance engineering; Management; management infrastructure; Market research; Reliability; repair process; service availability; soft failures; Transient analysis", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "0", unique-id = "Sankar:2014:SFL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2014:VPT, author = "Daehoon Kim and Hwanju Kim and Jaehyuk Huh", title = "{vCache}: Providing a Transparent View of the {LLC} in Virtualized Environments", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "109--112", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.20", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Since most of the current multi-core processors use a large last-level cache (LLC), efficient use of an LLC is critical for the overall performance of multi-cores. To improve the caching efficiency, page coloring is a representative software-based approach to allow the OS to control placement of pages on an LLC to improve their cache utility and to avoid conflicts among cores. 
However, system virtualization, with additional address translation by the hypervisor, can make page coloring techniques used by the guest OS ineffective, as guest physical addresses used by the guest OS for coloring differ from real addresses used for cache indexing in the LLCs. In this paper, we propose a novel LLC architecture to provide the guest OS with a flexible control over LLC placement in virtualized systems. The proposed vCache architecture can preserve coloring information set by the guest OS. In addition to color preservation, vCache can potentially eliminate the traditional limitation of page coloring, the cost of dynamic color changes for memory pages. By using the pollute buffer mechanism, one of the color-based cache optimization techniques, vCache shows performance improvement of benchmark applications up to 33\% without degrading the performance of another co-running application in the VM.", acknowledgement = ack-nhfb, affiliation = "Kim, D (Reprint Author), Korea Adv Inst Sci \& Technol, Dept Comp Sci, Taejon, South Korea. Kim, Daehoon; Kim, Hwanju; Huh, Jaehyuk, Korea Adv Inst Sci \& Technol, Dept Comp Sci, Taejon, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "daehoon@calab.kaist.ac.kr hjukim@calab.kaist.ac.kr jhuh@calab.kaist.ac.kr", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "SW Computing R\&D Program of KEIT(UX-oriented Mobile SW Platform) --- Ministry of Trade, Industry, and Energy [2011-10041313]", funding-text = "This research was supported by the SW Computing R\&D Program of KEIT(2011-10041313, UX-oriented Mobile SW Platform) funded by the Ministry of Trade, Industry, and Energy.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "address translation; B.3.2.b Cache memories; benchmark applications; buffer mechanism; C.1.4.e Multi-core/single-chip multiprocessors; C.1.5.e Memory hierarchy; cache indexing; Cache partitioning; cache storage; Cache storage; cache utility improvement; caching efficiency improvement; co-running application; color-based cache optimization techniques; coloring information preservation; core conflict avoidance; dynamic color cost; guest OS; guest physical address; hypervisor; last-level cache; LLC architecture; LLC placement; Memory management; memory pages; Multicore processing; multicore processor performance; multiprocessing systems; operating systems (computers); Page coloring; page coloring; page placement control; paged storage; software-based approach; system virtualization; transparent LLC view; vCache architecture; Virtual machine monitors; virtual machines; virtualisation; Virtualization; virtualized environments; VM", number-of-cited-references = "8", research-areas = "Computer Science", researcherid-numbers = "Huh, Jaehyuk/C-1716-2011", times-cited = "2", unique-id = "Kim:2014:VPT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2014:TCb, author = "Anonymous", title = "Table of Contents", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "C1--C1", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2368891", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2014:ICAa, author = "Anonymous", title = "{{\booktitle{IEEE Computer Architecture Letters}} Editorial Board}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "C2--C2", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2368892", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2014:ICAb, author = "Anonymous", title = "{{\booktitle{IEEE Computer Architecture Letters}}} Information for Authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "C3--C3", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2368893", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2014:ICSb, author = "Anonymous", title = "{IEEE Computer Society} [advertisement]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "C4--C4", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2368894", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Liao:2015:AWL, author = "Jianwei Liao and Fengxiang Zhang and Li Li and Guoqiang Xiao", title = "Adaptive Wear-Leveling in Flash-Based Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2329871", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The paper presents an adaptive wear-leveling scheme based on several wear-thresholds in different periods. The basic idea behind this scheme is that blocks can have different wear-out speeds and the wear-leveling mechanism does not conduct data migration until the erasure counts of some hot blocks hit a threshold. Through a series of emulation experiments based on several realistic disk traces, we show that the proposed wear-leveling mechanism can reduce total erasure counts and yield uniform erasure counts among all blocks at the late lifetime of the storage devices. 
As a result, not only can the performance of storage systems be advanced, the lifespan of the flash-based memory can also be extended to a certain degree.", acknowledgement = ack-nhfb, affiliation = "Liao, JW (Reprint Author), Southwest Univ, Coll Comp \& Informat Sci, Chongqing, Peoples R China. Liao, Jianwei; Zhang, Fengxiang; Li, Li; Xiao, Guoqiang, Southwest Univ, Coll Comp \& Informat Sci, Chongqing, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "liaojianwei@il.is.s.u-tokyo.ac.jp zhangfx@swu.edu.cn lily@swu.edu.cn gqxiao@swu.edu.cn", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Adaptive systems; adaptive wear-leveling; Ash; Benchmark testing; data migration; delayed migration; disk traces; emulation experiments; Equations; erasure evenness; extending lifetime; flash memories; flash-based memory; Flash-based storage devices; Market research; Servers; Standards; total erasure count reduction; wear; wear-leveling; wear-leveling mechanism; wear-out speeds; wear-thresholds", number-of-cited-references = "11", ORCID-numbers = "Liao, Jianwei/0000-0001-6149-6650", research-areas = "Computer Science", researcherid-numbers = "Liao, Jianwei/C-5339-2016", times-cited = "4", unique-id = "Liao:2015:AWL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2015:IIC, author = "Anonymous", title = "2014 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol.
13", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "1--5", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2387774", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Indexes", } @Article{Chen:2015:HSC, author = "Jie Chen and Guru Venkataramani", title = "A Hardware-Software Cooperative Approach for Application Energy Profiling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2323711", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Energy consumption by software applications is a critical issue that determines the future of multicore software development. In this article, we propose a hardware-software cooperative approach that uses hardware support to efficiently gather the energy-related hardware counters during program execution, and utilizes parameter estimation models in software to compute the energy consumption by instructions at a finer grain level (say basic block). We design mechanisms to minimize collinearity in profiler data, and present results to validate our energy estimation methodology.", acknowledgement = ack-nhfb, affiliation = "Chen, J (Reprint Author), George Washington Univ, Dept Elect \& Comp Engn, Washington, DC 20052 USA. 
Chen, Jie; Venkataramani, Guru, George Washington Univ, Dept Elect \& Comp Engn, Washington, DC 20052 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jiec@gwu.edu guruv@gwu.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application energy profiling; Benchmark testing; Energy consumption; energy consumption; energy debugging; energy estimation; energy estimation methodology; Energy profiling; energy-related hardware counters; Estimation; Hardware; hardware-software codesign; hardware-software cooperative approach; Mathematical model; multicore software development; multiprocessing systems; Parameter estimation; parameter estimation models; power aware computing; profiler data collinearity; program execution; Software; software applications", keywords-plus = "POWER", number-of-cited-references = "12", oa = "Bronze", research-areas = "Computer Science", times-cited = "1", unique-id = "Chen:2015:HSC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2015:ASM, author = "Dae-Hyun Kim and Prashant J. Nair and Moinuddin K. Qureshi", title = "Architectural Support for Mitigating Row Hammering in {DRAM} Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2332177", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "DRAM scaling has been the prime driver of increasing capacity of main memory systems. 
Unfortunately, lower technology nodes worsen the cell reliability as it increases the coupling between adjacent DRAM cells, thereby exacerbating different failure modes. This paper investigates the reliability problem due to Row Hammering, whereby frequent activations of a given row can cause data loss for its neighboring rows. As DRAM scales to lower technology nodes, the threshold for the number of row activations that causes data loss for the neighboring rows reduces, making Row Hammering a challenging problem for future DRAM chips. To overcome Row Hammering, we propose two architectural solutions: First, Counter-Based Row Activation (CRA), which uses a counter with each row to count the number of row activations. If the count exceeds the row hammering threshold, a dummy activation is sent to neighboring rows proactively to refresh the data. Second, Probabilistic Row Activation (PRA), which obviates storage overhead of tracking and simply allows the memory controller to proactively issue dummy activations to neighboring rows with a small probability for all memory access. Our evaluations show that these solutions are effective at mitigating Row hammering while causing negligible performance loss (< 1 percent).", acknowledgement = ack-nhfb, affiliation = "Kim, DH (Reprint Author), Georgia Inst Technol, Dept ECE, Atlanta, GA 30363 USA. Kim, Dae-Hyun; Nair, Prashant J.; Qureshi, Moinuddin K., Georgia Inst Technol, Dept ECE, Atlanta, GA 30363 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "dhkim@ece.gatech.edu pnair6@ece.gatech.edu moin@ece.gatech.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architectural support; cell reliability; Computer architecture; counter-based row activation; data errors; data retention; DRAM chips; DRAM memories; DRAM scaling; Dynamic random access memory; Dynamic random access memory, row hammering, data retention, data errors; Leakage currents; Logic gates; Microprocessors; probabilistic row activation; probability; Radiation detectors; Random access memory; reliability; reliability problem; row hammering; Transistors", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "23", unique-id = "Kim:2015:ASM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Nathan:2015:AGC, author = "Ralph Nathan and Daniel J. Sorin", title = "{Argus-G}: Comprehensive, Low-Cost Error Detection for {GPGPU} Cores", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2298391", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We have developed and evaluated Argus-G, an error detection scheme for general purpose GPU (GPGPU) cores. Argus-G is a natural extension of the Argus error detection scheme for CPU cores, and we demonstrate how to modify Argus such that it is compatible with GPGPU cores. Using an RTL prototype, we experimentally show that Argus-G can detect the vast majority of injected errors at relatively low performance, area, and power costs.", acknowledgement = ack-nhfb, affiliation = "Nathan, R (Reprint Author), Duke Univ, Durham, NC 27708 USA. Nathan, Ralph; Sorin, Daniel J., Duke Univ, Durham, NC 27708 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "ralph.nathan@duke.edu sorin@ee.duke.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Argus-G; Benchmark testing; Conferences; CPU cores; error detection; fault tolerance; general purpose GPU cores; GPGPU cores; Graphics processing units; graphics processing units; Graphics processors; Hardware; Hardware design languages; Instruction sets; low-cost error detection; Registers", number-of-cited-references = "18", research-areas = "Computer Science", times-cited = "0", unique-id = "Nathan:2015:AGC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{O:2015:CCI, author = "Seongil O and Sanghyuk Kwon and Young Hoon Son and Yujin Park and Jung Ho Ahn", title = "{CIDR}: a Cache Inspired Area-Efficient {DRAM} Resilience Architecture against Permanent Faults", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2324894", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "area overhead; area-efficient DRAM resilience architecture; Arrays; augmented cache; bit errors; Bloom filter; cache data array; cache storage; cache tags; cache-inspired DRAM resilience architecture; CIDR; Circuit faults; cost-sensitive main-memory DRAM devices; data structures; Decoding; device failure rates; DRAM arrays; DRAM chips; DRAM, error resilience, permanent faults, row and column sparing, Bloom filter, DRAM-side caching; energy overhead minimization; error statistics; fault diagnosis; faulty cells; I/O pads; memory architecture; permanent faults; processor-memory interfaces; Random access memory; Resilience; single-bit error rates; Testing; testing phase", remark = "NOTE(review): apparent duplicate of entry Seongil:2015:CCI (identical title, DOI, volume, number, and pages); the two records differ only in the parsing of the first author's name (family name O, given name Seongil). Consider merging the two entries or removing one.", } @Article{Seongil:2015:CCI, author = "O. Seongil and Sanghyuk Kwon and Young Hoon Son and Yujin Park and Jung Ho Ahn", title = "{CIDR}: a Cache Inspired Area-Efficient {DRAM} Resilience Architecture against Permanent Faults", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2324894", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Faulty cells have become major problems in cost-sensitive main-memory DRAM devices. Conventional solutions to reduce device failure rates due to cells with permanent faults, such as populating spare rows and relying on error-correcting codes, have had limited success due to high area overheads. In this paper, we propose CIDR, a novel cache-inspired DRAM resilience architecture, which substantially reduces the area overhead of handling bit errors from these faulty cells.
A DRAM device adopting CIDR has a small cache next to its I/O pads to replace accesses to the addresses that include the faulty cells with ones that correspond to the cache data array. We minimize the energy overhead of accessing the cache tags for every read or write by adding a Bloom filter in front of the cache. The augmented cache is programmed once during the testing phase and is out of the critical path on normal accesses because both cache and DRAM arrays are accessed in parallel, making CIDR transparent to existing processor-memory interfaces. Compared to the conventional architecture relying on spare rows, CIDR lowers the area overhead of achieving equal failure rates over a wide range of single-bit error rates, such as 23.6 x lower area overhead for a bit-error rate of 10(-5) and a device failure rate of 10(-3).", acknowledgement = ack-nhfb, affiliation = "Seongil, O (Reprint Author), Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul, South Korea. Seongil, O.; Kwon, Sanghyuk; Son, Young Hoon; Park, Yujin; Ahn, Jung Ho, Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "swdfish@snu.ac.kr kkwon114@snu.ac.kr yhson96@snu.ac.kr comesay@snu.ac.kr gajh@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bloom filter; DRAM; DRAM-side caching; error resilience; permanent faults; row and column sparing", number-of-cited-references = "13", oa = "Bronze", research-areas = "Computer Science", times-cited = "0", unique-id = "Seongil:2015:CCI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gupta:2015:CEO, author = "Ujjwal Gupta and Umit Y. 
Ogras", title = "Constrained Energy Optimization in Heterogeneous Platforms Using Generalized Scaling Models", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "21--25", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2326603", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Platform energy consumption and responsiveness are two major considerations for mobile systems since they determine the battery life and user satisfaction, respectively. We first present models for power consumption, response time and energy consumption of heterogeneous mobile platforms. Then, we use these models to optimize the energy consumption of baseline platforms under response time and temperature constraints with and without introducing new resources. We show that the optimal design choices depend on dynamic power management algorithm, and adding new resources is more energy efficient than scaling existing resources alone.", acknowledgement = ack-nhfb, affiliation = "Gupta, U (Reprint Author), Arizona State Univ, Sch Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta, Ujjwal; Ogras, Umit Y., Arizona State Univ, Sch Elect Comp \& Energy Engn, Tempe, AZ 85281 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ujjwal@asu.edu umit@asu.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "battery life determine; Computers; constrained energy optimization; dynamic power management algorithm; Energy consumption; Energy optimization; generalized scaling models; heterogeneous architectures; heterogeneous mobile platforms; Mobile communication; mobile computing; mobile platforms; mobile systems; MpSoC; Multicore processing; Optimization; performance; platform energy consumption; power aware computing; power consumption; Power demand; response time; temperature constraints; Time factors; user satisfaction", keywords-plus = "AMDAHLS LAW; MULTIAMDAHL; ACCELERATOR; MANAGEMENT; CPU; ERA", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "1", unique-id = "Gupta:2015:CEO", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Farmahini-Farahani:2015:DAA, author = "Amin Farmahini-Farahani and Jung Ho Ahn and Katherine Morrow and Nam Sung Kim", title = "{DRAMA}: an Architecture for Accelerated Processing Near Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "26--29", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2333735", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Improving energy efficiency is crucial for both mobile and high-performance computing systems while a large fraction of total energy is consumed to transfer data between storage and processing units. Thus, reducing data transfers across the memory hierarchy of a processor (i.e., off-chip memory, on-chip caches, and register file) can greatly improve the energy efficiency. 
To this end, we propose an architecture, DRAMA, that 3D-stacks coarse-grain reconfigurable accelerators (CGRAs) atop off-chip DRAM devices. DRAMA does not require changes to the DRAM device architecture, apart from through-silicon vias (TSVs) that connect the DRAM device's internal I/O bus to the CGRA layer. We demonstrate that DRAMA can reduce the energy consumption to transfer data across the memory hierarchy by 66--95 percent while achieving speedups of up to $18\times$ over a commodity processor.", acknowledgement = ack-nhfb, affiliation = "Farmahini-Farahani, A (Reprint Author), Univ Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA. Farmahini-Farahani, Amin; Morrow, Katherine; Kim, Nam Sung, Univ Wisconsin, Dept Elect \& Comp Engn, Madison, WI 53706 USA. Ahn, Jung Ho, Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul 151742, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "farmahinifar@wisc.edu gajh@snu.ac.kr kati@engr.wisc.edu nskim3@wisc.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3D-stacking; 3D-stacks coarse-grain reconfigurable accelerators; accelerated near memory processing; Acceleration; accelerator; Arrays; data transfers; DRAM; DRAM chips; DRAM devices; DRAMA architecture; dynamic random access memory; energy conservation; energy consumption reduction; energy efficiency; energy-efficient computing; high-performance computing systems; Kernel; memory hierarchy; Memory management; mobile computing systems; Near memory processing; Near memory processing, DRAM, 3D-stacking, energy-efficient computing, accelerator; processing units; Random access memory; Registers; storage management; storage units; through-silicon vias; total energy fraction; TSV", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "7", unique-id = "Farmahini-Farahani:2015:DAA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Carlson:2015:EPM, author = "Trevor E. Carlson and Siddharth Nilakantan and Mark Hempstead and Wim Heirman", title = "Epoch Profiles: Microarchitecture-Based Application Analysis and Optimization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "30--33", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2329873", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The performance of data-intensive applications, when running on modern multi- and many-core processors, is largely determined by their memory access behavior. Its most important contributors are the frequency and latency of off-chip accesses and the extent to which long-latency memory accesses can be overlapped with useful computation or with each other. 
In this paper we present two methods to better understand application and microarchitectural interactions. An epoch profile is an intuitive way to understand the relationships between three important characteristics: the on-chip cache size, the size of the reorder window of an out-of-order processor, and the frequency of processor stalls caused by long-latency, off-chip requests (epochs). By relating these three quantities one can more easily understand an application's memory reference behavior and thus significantly reduce the design space. While epoch profiles help to provide insight into the behavior of a single application, developing an understanding of a number of applications in the presence of area and core count constraints presents additional challenges. Epoch-based microarchitectural analysis is presented as a better way to understand the trade-offs for memory-bound applications in the presence of these physical constraints. Through epoch profiling and optimization, one can significantly reduce the multidimensional design space for hardware/software optimization through the use of high-level model-driven techniques.", acknowledgement = ack-nhfb, affiliation = "Carlson, TE (Reprint Author), Univ Ghent, Sint Pietersnieuwstr 41, B-9000 Ghent, East Flanders, Belgium. Carlson, Trevor E., Univ Ghent, B-9000 Ghent, East Flanders, Belgium. Nilakantan, Siddharth; Hempstead, Mark, Drexel Univ, Dept Elect \& Comp Engn, Bossone Res Ctr, Philadelphia, PA 19104 USA. Heirman, Wim, Intel Corp, Leuven, Flemish Brabant, Belgium.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "trevor.carlson@elis.ugent.be sn446@drexel.edu mhempstead@drexel.edu wim.heirman@intel.com", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Computational modeling; Frequency measurement; memory-level parallelism; Microarchitecture; Microarchitecture analysis; Out of order; System-on-chip; visualization", number-of-cited-references = "6", oa = "Green Published", ORCID-numbers = "Carlson, Trevor/0000-0001-8742-134X Nilakantan, Siddharth/0000-0003-1067-700X Heirman, Wim/0000-0003-2286-1525", research-areas = "Computer Science", researcherid-numbers = "Carlson, Trevor/M-4945-2016", times-cited = "0", unique-id = "Carlson:2015:EPM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Power:2015:GGH, author = "Jason Power and Joel Hestness and Marc S. Orr and Mark D. Hill and David A. Wood", title = "{gem5-gpu}: a Heterogeneous {CPU--GPU} Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "34--36", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2299539", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", abstract = "gem5-gpu is a new simulator that models tightly integrated CPU-GPU systems. It builds on gem5, a modular full-system CPU simulator, and GPGPU-Sim, a detailed GPGPU simulator. gem5-gpu routes most memory accesses through Ruby, which is a highly configurable memory system in gem5. By doing this, it is able to simulate many system configurations, ranging from a system with coherent caches and a single virtual address space across the CPU and GPU to a system that maintains separate GPU and CPU physical address spaces. gem5-gpu can run most unmodified CUDA 3.2 source code. Applications can launch non-blocking kernels, allowing the CPU and GPU to execute simultaneously. 
We present gem5-gpu's software architecture and a brief performance validation. We also discuss possible extensions to the simulator. gem5-gpu is open source and available at gem5-gpu.cs.wisc.edu.", acknowledgement = ack-nhfb, affiliation = "Power, J (Reprint Author), Univ Wisconsin, Dept Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA. Power, Jason; Hestness, Joel; Orr, Marc S.; Hill, Mark D.; Wood, David A., Univ Wisconsin, Dept Comp Sci, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "powerjg@cs.wisc.edu hestness@cs.wisc.edu morr@cs.wisc.edu markhill@cs.wisc.edu david@cs.wisc.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Coherence; Computational modeling; Computer architecture; computer architecture; gem5-gpu simulator; general-purpose graphics processors; GPGPUSim; Graphics processing units; graphics processing units; heterogeneous (hybrid) systems; heterogeneous CPU-GPU simulator; Kernel; Modeling techniques; modular full-system CPU simulator; nonblocking kernels; Object oriented modeling; Protocols; simulators; software architecture", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "62", unique-id = "Power:2015:GGH", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Manatunga:2015:HSS, author = "Dilan Manatunga and Joo Hwan Lee and Hyesoon Kim", title = "Hardware Support for Safe Execution of Native Client Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "37--40", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2309601", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Over the past few years, there has been vast growth in the area of the web browser as an applications platform. One example of this trend is Google's Native Client (NaCl) platform, which is a software-fault isolation mechanism that allows the running of native x86 or ARM code on the browser. One of the security mechanisms employed by NaCl is that all branches must jump to the start of a valid instruction. In order to achieve this criteria though, all return instructions are replaced by a specific branch instruction sequence, which we call NaCl returns, that are guaranteed to return to a valid instruction. However, these NaCl returns lose the advantage of the highly accurate return-address stack (RAS) in exchange for the less accurate indirect branch predictor. In this paper, we propose a NaCl-RAS mechanism that can identify and accurately predict 76.9 percent on average compared to the 39.5 percent of a traditional BTB predictor.", acknowledgement = ack-nhfb, affiliation = "Manatunga, D (Reprint Author), Georgia Inst Technol, Sch Comp Sci, Atlanta, GA 30332 USA. Manatunga, Dilan; Lee, Joo Hwan; Kim, Hyesoon, Georgia Inst Technol, Sch Comp Sci, Atlanta, GA 30332 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "dmanatunga@gatech.edu joohwan.lee@gatech.edu hyesoon@cc.gatech.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; ARM code; Benchmark testing; branch instruction sequence; branch prediction accuracy; BTB predictor; Detectors; fault diagnosis; Google; Hardware; hardware support; NaCl-RAS mechanism; Native client; native client applications; native x86; online front-ends; return address prediction; return-address stack; safe execution; Security; security mechanism; security of data; Software; software fault isolation; software-fault isolation mechanism; Web browser", keywords-plus = "SANDBOX; CODE", number-of-cited-references = "5", research-areas = "Computer Science", times-cited = "0", unique-id = "Manatunga:2015:HSS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Liu:2015:LHP, author = "Longjun Liu and Chao Li and Hongbin Sun and Yang Hu and Jingmin Xin and Nanning Zheng and Tao Li", title = "Leveraging Heterogeneous Power for Improving Datacenter Efficiency and Resiliency", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "41--45", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2363084", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Power mismatching between supply and demand has emerged as a top issue in modern datacenters that are under-provisioned or powered by intermittent power supplies. Recent proposals are primarily limited to leveraging uninterruptible power supplies (UPS) to handle power mismatching, and therefore lack the capability of efficiently handling the irregular peak power mismatches. In this paper we propose hPower, the first heterogeneous energy buffering strategy that incorporates supercapacitors into existing datacenters to handle power mismatch. 
Our technique exploits power supply diversity and smart load assignment to provide efficiency-aware and emergency-aware power mismatch management. We show that hPower could improve energy efficiency by 30 percent, extend UPS lifetime by $4.3\times$, and reduce system downtime by 36 percent. It allows datacenters to adapt themselves to various power supply anomalies, thereby improving operational efficiency and resiliency.", acknowledgement = ack-nhfb, affiliation = "Liu, LJ (Reprint Author), Xi An Jiao Tong Univ, Sch Elect \& Informat Engn, Xian 710049, Peoples R China. Liu, Longjun; Sun, Hongbin; Xin, Jingmin; Zheng, Nanning, Xi An Jiao Tong Univ, Sch Elect \& Informat Engn, Xian 710049, Peoples R China. Li, Chao, Shanghai Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200030, Peoples R China. Hu, Yang; Li, Tao, Univ Florida, Dept Elect \& Comp Engn, Gainesville, FL USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "longjun.liu@stu.xjtu.edu.cn lichao@cs.sjtu.edu.cn hsun@mail.xjtu.edu.cn huyang.ece@ufl.edu jxin@mail.xjtu.edu.cn nnzheng@mail.xjtu.edu.cn taoli@ece.ufl.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Batteries; computer centres; computer system implementation; Computer System Implementation; computer system implementation; data center efficiency; data center resiliency; efficiency-aware power mismatch management; emergency-aware power mismatch management; energy conservation; Energy efficiency; Energy-aware systems; Energy-Aware Systems; heterogeneous energy buffering strategy; heterogeneous power; hPower; performance of systems; Performance of Systems; power aware computing; Power demand; power mismatching; power supply anomalies; power supply diversity; Servers; smart load assignment; Supercapacitors; supercapacitors; system downtime reduction; uninterruptible power supplies; Uninterruptible power systems; UPS", number-of-cited-references = "16", research-areas = "Computer Science", times-cited = "6", unique-id = "Liu:2015:LHP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2015:LNV, author = "Rui Wang and Wangyuan Zhang and Tao Li and Depei Qian", title = "Leveraging Non-Volatile Storage to Achieve Versatile Cache Optimizations", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "46--49", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2298412", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The efficiency of caches plays a vital role in microprocessor. In this paper, we introduce a novel and flexible cache substrate that employs non-volatile yet versatile SRAM (NV2-SRAM) cell design, which synergistically integrates new memory devices into the standard SRAM cells. 
Our experiments show that it can achieve a 67 percent energy saving and $3.1\times$ reliability improvement over the SRAM based cache, outperforming the drowsy cache design in terms of both power efficiency and reliability. Moreover, the proposed cache architecture can be used to improve the performance of prefetching schemes by 10 percent.", acknowledgement = ack-nhfb, affiliation = "Wang, R (Reprint Author), Beihang Univ, Sch Comp Sci \& Engn, State Key Lab Software Dev Environm, Beijing 100191, Peoples R China. Wang, Rui; Qian, Depei, Beihang Univ, Sch Comp Sci \& Engn, State Key Lab Software Dev Environm, Beijing 100191, Peoples R China. Zhang, Wangyuan; Li, Tao, Univ Florida, ECE Dept, Gainesville, FL 32611 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "rui.wang@jsi.buaa.edu.cn zhangwangyuan@gmail.com taoli@ece.ufl.edu depeiq@buaa.edu.cn", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache architecture; Cache memories; cache storage; Computer architecture; energy saving; flexible cache substrate; low-power design; Magnetic tunneling; memory structures; microprocessor; Microprocessors; Nonvolatile memory; nonvolatile storage; nonvolatile yet versatile SRAM cell design; NV2-SRAM cell design; Prefetching; prefetching schemes; reliability improvement; SRAM; SRAM based cache; SRAM cells; SRAM chips; storage management; versatile cache optimizations", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "1", unique-id = "Wang:2015:LNV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Mohammadi:2015:DDB, author = "Milad Mohammadi and Song Han and Tor M. Aamodt and William J. 
Dally", title = "On-Demand Dynamic Branch Prediction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "50--53", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2330820", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In out-of-order (OoO) processors, speculative execution with high branch prediction accuracy is employed to achieve good single thread performance. In these processors the branch prediction unit tables (BPU) are accessed in parallel with the instruction cache before it is known whether a fetch group contains branch instructions. For integer applications, we find 85 percent of BPU lookups are done for non-branch operations and of the remaining lookups, 42 percent are done for highly biased branches that can be predicted statically with high accuracy. We evaluate on-demand branch prediction (ODBP), a novel technique that uses compiler generated hints to identify those instructions that can be more accurately predicted statically to eliminate unnecessary BPU lookups. We evaluate an implementation of ODBP that combines static and dynamic branch prediction. For a four wide superscalar processor, ODBP delivers as much as 9 percent improvement in average energy-delay (ED) product, 7 percent core average energy saving, and 3 percent speedup. ODBP also enables the use of large BPU's for a given power budget.", acknowledgement = ack-nhfb, affiliation = "Mohammadi, M (Reprint Author), Stanford Univ, Dept Elect Engn, Stanford, CA 94305 USA. Mohammadi, Milad; Han, Song; Dally, William J., Stanford Univ, Dept Elect Engn, Stanford, CA 94305 USA. Aamodt, Tor M., Univ British Columbia, Dept Elect \& Comp Engn, Vancouver, BC V6T 1Z4, Canada.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "milad@stanford.edu songhan@stanford.edu aamodt@ece.ubc.ca dally@stanford.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; ahead prediction; BPU lookup; branch instruction; branch prediction accuracy; branch prediction unit table; cache storage; compiler generated hints; Computer architecture; core average energy saving; ED product; Energy efficiency; energy-delay product; energy-delay product optimization; Equations; instruction cache; instruction sets; Mathematical model; nonbranch operation; ODBP; on-demand branch prediction; on-demand dynamic branch prediction; OoO processor; out-of-order processor; parallel processing; Pipelines; power budget; program compilers; Program processors; single thread performance; speculative execution; static and dynamic branch prediction hybrid; static branch prediction; superscalar processor; table lookup; Tin", keywords-plus = "MICROPROCESSOR; DESIGN", number-of-cited-references = "27", research-areas = "Computer Science", times-cited = "1", unique-id = "Mohammadi:2015:DDB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Azriel:2015:PMT, author = "Leonid Azriel and Avi Mendelson and Uri Weiser", title = "Peripheral Memory: a Technique for Fighting Memory Bandwidth Bottleneck", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "54--57", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2319077", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory bottleneck has always been a major cause for limiting the performance of 
computer systems. While in the past latency was the major concern, today, lack of bandwidth becomes a limiting factor as well, as a result of exploiting more parallelism with the growing number of cores per die, which intensifies the pressure on the memory bus. In such an environment, any additional traffic to memory, such as the I/O traffic may lead to degradation of the overall performance of the system. This work introduces the concept of Peripheral Memory, a software controlled memory that resides in the I/O domain and can be used for offloading I/O traffic from CPU memory. The Peripheral Memory handles `I/O exclusive data', data originated and terminated at I/O domain, and which does not need any processing by the CPU. The paper analyses the impact of I/O traffic on the overall performance of the current systems and demonstrates that in numerous applications, I/O exclusive data occupies major part of memory bandwidth, as a result, degrading CPU processing performance and increasing power. The paper considers four different implementations of the Peripheral Memory: pageable, pinned, non-coherent split-traffic and copy-on-access. Our full-system simulator indicates that non-coherent split traffic configuration is the most efficient implementation, which can provide up to four times speedup in the I/O processing rate for typical I/O intensive applications. In addition, based on Power model and measurements tools, the paper demonstrates that the Peripheral Memory in a server system can lead to reduction of tens of Watts in the overall system power consumption or 10-20 percent of the system power budget.", acknowledgement = ack-nhfb, affiliation = "Azriel, L (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Azriel, Leonid; Mendelson, Avi; Weiser, Uri, Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "leonida@tx.technion.ac.il avi.mendelson@tce.technion.ac.il uri.weiser@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; bandwidth allocation; Benchmark testing; computer system performance; CPU memory; full-system simulator; I/O domain; I/O traffic offloading; input/output devices; Instruction sets; interconnection architectures; main memory; memory bandwidth bottleneck; memory bus; Memory management; parallelism; performance evaluation; Performance evaluation; peripheral memory; Power demand; Power measurement; server system; software controlled memory; storage management; system buses", keywords-plus = "NETWORK; I/O", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "1", unique-id = "Azriel:2015:PMT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2015:PTM, author = "Zhaoguo Wang and Han Yi and Ran Liu and Mingkai Dong and Haibo Chen", title = "Persistent Transactional Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "58--61", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2329832", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper proposes persistent transactional memory (PTM), a new design that adds durability to transactional memory (TM) by incorporating with the emerging non-volatile memory (NVM). 
PTM dynamically tracks transactional updates to cache lines to ensure the ACI (atomicity, consistency and isolation) properties during cache flushes and leverages an undo log in NVM to ensure PTM can always consistently recover transactional data structures from a machine crash. This paper describes the PTM design based on Intel's restricted transactional memory. A preliminary evaluation using a concurrent key/value store and a database with a cache-based simulator shows that the additional cache line flushes are small.", acknowledgement = ack-nhfb, affiliation = "Wang, ZG (Reprint Author), Shanghai Jiao Tong Univ, Shanghai Key Lab Scalable Comp \& Syst, Shanghai 200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu, Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong Univ, Shanghai Key Lab Scalable Comp \& Syst, Shanghai 200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu, Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong Univ, Inst Parallel \& Distributed Syst, Shanghai 200030, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tigerwang1986@gmail.com ken.yihan1990@gmail.com naruilone@gmail.com mingkaidong@gmail.com haibochen@sjtu.edu.cn", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ACI properties; Batteries; cache line flushes; cache storage; cache-based simulator; Computer crashes; Data structures; Databases; Hardware; Hardware transactional memory; non-volatile random access memory; Nonvolatile memory; nonvolatile memory; NVM; persistent transactional memory; PTM design; Registers", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "5", unique-id = "Wang:2015:PTM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gibert:2015:PSR, author = "Enric Gibert and Raul Mart{\'\i}nez and Carlos Madriles and Josep M. Codina", title = "Profiling Support for Runtime Managed Code: Next Generation Performance Monitoring Units", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "62--65", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2321398", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Given the increase of runtime managed code environments in desktop, server, and mobile segments, agile, flexible, and accurate performance monitoring capabilities are required in order to perform wise code transformations and optimizations. Common profiling strategies, mainly based on instrumentation and current performance monitoring units (PMUs), are not adequate and new innovative designs are necessary. In this paper, we present the desired characteristics of what we call next generation PMUs and advocate for hardware/software collaborative approaches where hardware implements the profiling hooks and mechanisms and software implements the complex heuristics. 
We then propose a first design in which the hardware uses a small, yet flexible table to profile specific code regions and the software decides what/when/how to profile. This first design meets all required features and we aim it as the seed for future PMUs extensions to enable novel dynamic code transformations and optimizations.", acknowledgement = ack-nhfb, affiliation = "Gibert, E (Reprint Author), Intel Corp, Intel Labs, Intel Barcelona Res Ctr IBRC, Edifici Nexus 2, Planta 0-D, Jordi Girona 29, Barcelona, Spain. Gibert, Enric; Martinez, Raul; Madriles, Carlos; Codina, Josep M., Intel Corp, Intel Labs, Intel Barcelona Res Ctr IBRC, Barcelona, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "enric.gibert.codina@intel.com raul.martinez@intel.com carlos.madriles.gimeno@intel.com josep.m.codina@intel.com", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "dynamic code optimizations; dynamic code transformations; groupware; Hardware; hardware-software collaborative approaches; instrumentation; Instruments; just in time (JIT) compiler; Monitoring; next generation performance monitoring units; optimising compilers; Optimization; Performance monitoring unit (PMU); Phasor measurement units; PMUs; profiling; profiling hooks; profiling support; Runtime; runtime managed code; runtime managed code environments; Software; software performance evaluation; system monitoring", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "3", unique-id = "Gibert:2015:PSR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{You:2015:QSA, author = "Daecheol You and Ki-Seok Chung", title = "Quality of Service-Aware Dynamic Voltage and Frequency Scaling for Embedded {GPUs}", journal = 
j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "66--69", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2319079", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Dynamic voltage and frequency scaling (DVFS) is a key technique for reducing processor power consumption in mobile devices. In recent years, mobile system-on-chips (SoCs) have supported DVFS for embedded graphics processing units (GPUs) as the processing power of embedded GPUs has been increasing steadily. The major challenge of applying DVFS to a processing unit is to meet the quality of service (QoS) requirement while achieving a reasonable power reduction. In the case of GPUs, the QoS requirement can be specified as the frame-per-second (FPS) which the target GPU should achieve. The proposed DVFS technique ensures a consistent GPU performance by scaling the operating clock frequency in a way that it maintains a uniform FPS.", acknowledgement = ack-nhfb, affiliation = "You, D (Reprint Author), Hanyang Univ, Dept Elect Comp \& Commun Engn, Embedded Syst Chip Lab, Seoul 133791, South Korea. You, Daecheol; Chung, Ki-Seok, Hanyang Univ, Dept Elect Comp \& Commun Engn, Embedded Syst Chip Lab, Seoul 133791, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "khsrdc@hanyang.ac.kr kchung@hanyang.ac.kr", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Clocks; Correlation; DVFS; dynamic voltage scaling; embedded GPU; Energy consumption; energy-aware systems; frequency scaling; graphics processing unit; Graphics processing units; graphics processing units; Graphics processors; hardware/software interfaces; low-power design; mobile device; mobile system-on-chips; operating clock frequency; power aware computing; processor power consumption; Quality of service; quality of service; SoC; System-on-chip; system-on-chip", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "9", unique-id = "You:2015:QSA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lee:2015:RDA, author = "Sungjin Lee and Jihong Kim and Arvind", title = "Refactored Design of {I/O} Architecture for Flash Storage", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "70--74", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2329423", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Flash storage devices behave quite differently from hard disk drives (HDDs); a page on flash has to be erased before it can be rewritten, and the erasure has to be performed on a block which consists of a large number of contiguous pages. It is also important to distribute writes evenly among flash blocks to avoid premature wearing. To achieve interoperability with existing block I/O subsystems for HDDs, NAND flash devices employ an intermediate software layer, called the flash translation layer (FTL), which hides these differences. 
Unfortunately, FTL implementations require powerful processors with a large amount of DRAM in flash controllers and also incur many unnecessary I/O operations which degrade flash storage performance and lifetime. In this paper, we present a refactored design of I/O architecture for flash storage which dramatically increases storage performance and lifetime while decreasing the cost of the flash controller. In comparison with page-level FTL, our preliminary experiments show a reduction of 19 percent in I/O operations, improvement of I/O performance by 9 percent and storage lifetime by 36 percent. In addition, our scheme uses only 1/128 DRAM memory in the flash controller.", acknowledgement = ack-nhfb, affiliation = "Lee, S (Reprint Author), MIT, 77 Massachusetts Ave, Cambridge, MA 02139 USA. Lee, Sungjin; Arvind, MIT, Cambridge, MA 02139 USA. Kim, Jihong, Seoul Natl Univ, Sch Comp Sci \& Engn, Seoul, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "chamdoo@gmail.com jihong@davinci.snu.ac.kr arvind@csail.mit.edu", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; block I/O subsystems; Computer architecture; DRAM chips; DRAM memory; file systems; flash blocks; flash memories; flash storage; flash translation layer; hard disk drives; HDDs; I/O architecture; I/O architectures; input-output programs; intermediate software layer; interoperability; NAND circuits; NAND flash devices; NAND flash memory; page-level FTL; Performance evaluation; premature wearing; Random access memory; Runtime; Storage management; Storage systems", number-of-cited-references = "15", research-areas = "Computer Science", times-cited = "7", unique-id = "Lee:2015:RDA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yuan:2015:SGR, author = "Fengkai Yuan and Zhenzhou Ji and Suxia Zhu", title = "Set-Granular Regional Distributed Cooperative Caching", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "75--78", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2319258", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The last level cache (LLC) in private configurations offers lower latency and isolation but extinguishes the possibility of sharing underutilized cache resources. Cooperative Caching (CC) provides capacity sharing by spilling a line evicted from one cache to another. Current studies focus on efficient capacity sharing, while the adaptability of CC to manycore environment deserves more attention. In this paper, we present Set-granular Regional Distributed Cooperative Caching to optimize CC in manycore CMPs with private LLCs. 
We achieve efficient capacity sharing by a low-traffic global receiver tracking mechanism and provide a method to manage set-grain cache state transitions for exclusive LLCs. Experiment results show that SRDCC performs better than baseline system, running different workloads varying in receiver-spiller number and distribution, in execution time up to 15.55 percent and memory access up to 40.25 percent, at a negligible cost of network traffics (6.21 percent more than baseline system at worst).", acknowledgement = ack-nhfb, affiliation = "Yuan, FK (Reprint Author), Harbin Inst Technol, Sch Comp Sci \& Technol, Harbin 150006, Heilongjiang, Peoples R China. Yuan, Fengkai; Ji, Zhenzhou; Zhu, Suxia, Harbin Inst Technol, Sch Comp Sci \& Technol, Harbin 150006, Heilongjiang, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yuan.fengkai@gmail.com jizhenzhou@hit.edu.cn", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache coherence protocol; cache resource sharing; Cache storage; cache storage; capacity sharing; CC; chip multiprocessors; cooperative caching; Cooperative caching; last level cache; LLC; manycore CMP; multiprocessing systems; on-chip networks; private cache configuration; Protocols; Radiation detectors; receiver-spiller distribution; receiver-spiller number; Receivers; set-grain cache state transition; set-granular regional distributed cooperative caching; Telecommunication traffic; Tiled CMP", keywords-plus = "CHIP MULTIPROCESSORS", number-of-cited-references = "9", ORCID-numbers = "Yuan, Fengkai/0000-0003-2615-8642", research-areas = "Computer Science", times-cited = "2", unique-id = "Yuan:2015:SGR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lee:2015:SSI, author = "Junghee Lee and Youngjae Kim and Jongman Kim and Galen M. Shipman", title = "Synchronous {I/O} Scheduling of Independent Write Caches for an Array of {SSDs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "79--82", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2298394", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Solid-state drives (SSD) offer a significant performance improvement over the hard disk drives (HDD), however, it can exhibit a significant variance in latency and throughput due to internal garbage collection (GC) process on the SSD. When the SSDs are configured in a RAID, the performance variance of individual SSDs could significantly degrade the overall performance of the RAID of SSDs. 
The internal cache on the RAID controller can help mitigate the performance variability issues of SSDs in the array; however, the state-of-the-art cache algorithm of the RAID controller does not consider the characteristics of SSDs. In this paper, we examine the most recent write cache algorithm for the array of disks, and propose a synchronous independent write cache (SIW) algorithm. We also present a pre-parity-computation technique for the RAID of SSDs with parity computations, which calculates parities of blocks in advance before they are stored in the write cache. With this new technique, we propose a complete paradigm shift in the design of write cache. In our evaluation study, large write requests dominant workloads show up to about 50 and 20 percent improvements in average response times on RAID-0 and RAID-5 respectively as compared to the state-of-the-art write cache algorithm.", acknowledgement = ack-nhfb, affiliation = "Lee, J (Reprint Author), Univ Texas San Antonio, San Antonio, TX 78229 USA. Lee, Junghee, Univ Texas San Antonio, San Antonio, TX 78229 USA. Kim, Youngjae, Ajou Univ, Suwon 441749, South Korea. Kim, Jongman, Georgia Inst Technol, Atlanta, GA 30332 USA. Shipman, Galen M., Oak Ridge Natl Lab, Oak Ridge, TN USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "junghee.lee@utsa.edu youkim@gmail.com jkim@ece.gatech.edu gshipman@ornl.gov", da = "2019-06-20", doc-delivery-number = "CL1QK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithm design and analysis; Arrays; cache storage; Delays; disks array; flash memory; GC process; hard disk drives; HDD; I/O scheduling; independent write caches; input-output programs; internal cache; internal garbage collection process; memory architecture; pre-parity-computation technique; RAID; RAID controller; Redundant array of independent disks (RAID); Redundant Array of Independent Disks (RAID); Redundant array of independent disks (RAID); scheduling; SIW algorithm; solid-state drive (SSD); Solid-State Drive (SSD); solid-state drive (SSD); solid-state drives; SSD; Strips; Synchronization; synchronous I/O scheduling; synchronous independent write cache algorithm; Time factors; write cache; Write cache; write cache; write cache design; write requests", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "2", unique-id = "Lee:2015:SSI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2015:RSW, author = "Anonymous", title = "Rock Stars of Wearables", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "83--83", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2447192", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:RSC, author = "Anonymous", title = "Rock Stars of Cybersecurity 2015 Conference", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "84--84", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2447191", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:TCa, author = "Anonymous", title = "Table of Contents", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "C1--C1", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2446391", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:ICAa, author = "Anonymous", title = "{{\booktitle{IEEE Computer Architecture Letters}} Editorial Board}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "C2--C2", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2446392", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:ICAb, author = "Anonymous", title = "{{\booktitle{IEEE Computer Architecture Letters}}} Information for Authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "C3--C3", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2446393", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:ICSa, author = "Anonymous", title = "{IEEE Computer Society}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "1", pages = "C4--C4", month = jan # "\slash " # jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2446394", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Shi:2015:CLM, author = "Qingchuan Shi and Henry Hoffmann and Omer Khan", title = "A Cross-Layer Multicore Architecture to Tradeoff Program Accuracy and Resilience Overheads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "85--89", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2365204", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To protect multicores from soft-error perturbations, resiliency schemes have been developed with high coverage but high power/performance overheads (similar to 2x). We observe that not all soft-errors affect program correctness, some soft-errors only affect program accuracy, i.e., the program completes with certain acceptable deviations from soft-error free outcome. Thus, it is practical to improve processor efficiency by trading off resilience overheads with program accuracy. 
We propose the idea of declarative resilience that selectively applies resilience schemes to both crucial and non-crucial code, while ensuring program correctness. At the application level, crucial and non-crucial code is identified based on its impact on the program outcome. The hardware collaborates with software support to enable efficient resilience with 100 percent soft-error coverage. Only program accuracy is compromised in the worst-case scenario of a soft-error strike during non-crucial code execution. For a set of multithreaded benchmarks, declarative resilience improves completion time by an average of 21 percent over state-of-the-art hardware resilience scheme that protects all executed code. Its performance overhead is similar to 1.38x over a multicore that does not support resilience.", acknowledgement = ack-nhfb, affiliation = "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect \& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan; Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn, Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago, Dept Comp Sci, Chicago, IL 60637 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu khan@uconn.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; Benchmark testing; code execution; Instruction sets; multi-threading; multicore architecture; Multicore processing; multicores; multithreaded benchmark; program accuracy; Resilience; resilience overhead; Soft errors; soft-error perturbation; soft-errors; software architecture; software fault tolerance", number-of-cited-references = "23", research-areas = "Computer Science", times-cited = "4", unique-id = "Shi:2015:CLM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zheng:2015:ACC, author = "Zhong Zheng and Zhiying Wang and Mikko Lipasti", title = "Adaptive Cache and Concurrency Allocation on {GPGPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "90--93", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2359882", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory bandwidth is critical to GPGPU performance. Exploiting locality in caches can better utilize memory bandwidth. However, memory requests issued by excessive threads cause cache thrashing and saturate memory bandwidth, degrading performance. In this paper, we propose adaptive cache and concurrency allocation (CCA) to prevent cache thrashing and improve the utilization of bandwidth and computational resources, hence improving performance. According to locality and reuse distance of access patterns in GPGPU program, warps on a stream multiprocessor are dynamically divided into three groups: cached, bypassed, and waiting. The data cache accommodates the footprint of cached warps. 
Bypassed warps cannot allocate cache lines in the data cache to prevent cache thrashing, but are able to take advantage of available memory bandwidth and computational resource. Waiting warps are de-scheduled. Experimental results show that adaptive CCA can significantly improve benchmark performance, with 80 percent harmonic mean IPC improvement over the baseline.", acknowledgement = ack-nhfb, affiliation = "Zheng, Z (Reprint Author), Natl Univ Def Technol, State Key Lab High Performance Comp, Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ Def Technol, State Key Lab High Performance Comp, Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ Def Technol, Sch Comp, Changsha, Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin, Dept Elect \& Comp Engn, Madison, WI 54706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn mikko@engr.wisc.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC [61070037, 61272143, 61272144, 61103016, 61202121]; NUDT [B120607]; RFDP [20114307120013]; NSF [CCF-1318298]", funding-text = "This work was partially supported by CSC, 863 Program (2012AA010905), NSFC (61070037, 61272143, 61272144, 61103016, 61202121), NUDT(B120607), RFDP (20114307120013), and NSF (CCF-1318298).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "access patterns; adaptive cache-and-concurrency allocation; Bandwidth; bandwidth utilization improvement; benchmark performance improvement; Benchmark testing; bypassed warps; cache; cache lines; cache locality; Cache memory; cache storage; cache thrashing prevention; cached warps; CCA; computational resource utilization improvement; concurrency; concurrency control; Concurrent computing; GPGPU; GPGPU performance improvement; graphics processing units; harmonic mean IPC improvement; Instruction sets; memory bandwidth saturation; multi-threading; multiprocessing systems; performance evaluation; Resource management; reuse distance; stream multiprocessor; waiting warp descheduling", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "4", unique-id = "Zheng:2015:ACC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Nowatzki:2015:GBP, author = "Tony Nowatzki and Venkatraman Govindaraju and Karthikeyan Sankaralingam", title = "A Graph-Based Program Representation for Analyzing Hardware Specialization Approaches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "94--98", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2476801", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware specialization has emerged as a promising paradigm for future microprocessors. Unfortunately, it is natural to develop and evaluate such architectures within end-to-end vertical silos spanning application, language/compiler, hardware design and evaluation tools, leaving little opportunity for cross-architecture analysis and innovation. 
This paper develops a novel program representation suitable for modeling heterogeneous architectures with specialized hardware, called the transformable dependence graph (TDG), which combines semantic information about program properties and low-level hardware events in a single representation. We demonstrate, using four example architectures from the literature, that the TDG is a feasible, simple, and accurate modeling technique for transparent specialization architectures, enabling cross-domain comparison and design-space exploration.", acknowledgement = ack-nhfb, affiliation = "Nowatzki, T (Reprint Author), Univ Wisconsin, Dept Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA. Nowatzki, Tony; Govindaraju, Venkatraman; Sankaralingam, Karthikeyan, Univ Wisconsin, Dept Comp Sci, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tjn@cs.wisc.edu venkatra@cs.wisc.edu karu@cs.wisc.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; accelerators; computer architecture; Computer architecture; dependence graphs; graph theory; graph-based program representation; Hardware specialization; hardware specialization approach; heterogeneous architecture modeling; Load modeling; Microarchitecture; microprocessors; Microprocessors; modelling; program representation; Specialization; Specialization, accelerators, modelling, program representation, dependence graphs; TDG; transformable dependence graph; Transforms", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "2", unique-id = "Nowatzki:2015:GBP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2015:PEM, author = "Seung Hun Kim and Dohoon Kim and Changmin Lee and Won Seob Jeong and Won Woo Ro and Jean-Luc Gaudiot", title = "A Performance-Energy Model to Evaluate Single Thread Execution Acceleration", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "99--102", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2368144", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "It is well known that the cost of executing the sequential portion of a program will limit and sometimes even eclipse the gains brought by processing in parallel the rest of the program. This means that serious consideration should be brought to bear on accelerating the execution of this unavoidable sequential part. Such acceleration can be done by boosting the operating frequency in a symmetric multicore processor. In this paper, we derive a performance and power model to describe the implications of this approach. 
From our model, we show that the ratio of performance over energy during the sequential part improves with an increase in the number of cores. In addition, we demonstrate how to determine with the proposed model the optimal frequency boosting ratio which maximizes energy efficiency.", acknowledgement = ack-nhfb, affiliation = "Kim, SH (Reprint Author), Yonsei Univ, Sch Elect \& Elect Engn, Seoul 120749, South Korea. Kim, Seung Hun; Kim, Dohoon; Lee, Changmin; Jeong, Won Seob; Ro, Won Woo, Yonsei Univ, Sch Elect \& Elect Engn, Seoul 120749, South Korea. Gaudiot, Jean-Luc, Univ Calif Irvine, Dept Elect Engn \& Comp Sci, Irvine, CA USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "kseunghun@gmail.com dohoon.kim@yonsei.ac.kr exahz@yonsei.ac.kr ws.jeong@yonsei.ac.kr wro@yonsei.ac.kr gaudiot@uci.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Basic Science Research Program through the National Research Foundation of Korea (NRF) --- Ministry of Education [2010-0013202]; National Science Foundation [CCF-1439165]", funding-text = "This work was supported in part by the Basic Science Research Program through the National Research Foundation of Korea (NRF) funded by the Ministry of Education (2010-0013202) and by the National Science Foundation, under award CCF-1439165. Any opinions, findings, and conclusions expressed in this material are those of the authors and do not necessarily reflect the views of the sponsors. W. W. Ro is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "energy efficiency; Energy management; energy-aware systems; Mathematical model; Microprocessors; Multicore processing; multiprocessing systems; multiprocessor systems; optimal frequency boosting ratio; parallel processing; performance evaluation; Performance evaluation; Performance modeling; performance-energy model; power aware computing; Power demand; single thread execution acceleration; symmetric multicore processor", keywords-plus = "AMDAHLS LAW; ERA", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Kim:2015:PEM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Song:2015:ARL, author = "William Song and Saibal Mukhopadhyay and Sudhakar Yalamanchili", title = "Architectural Reliability: Lifetime Reliability Characterization and Management of Many-Core Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "103--106", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2340873", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper presents a lifetime reliability characterization of many-core processors based on a full-system simulation of integrated microarchitecture, power, thermal, and reliability models. Under normal operating conditions, our model and analysis reveal that the mean-time-to-failure of cores on the die show normal distribution. From the processor-level perspective, the key insight is that reducing the variance of the distribution can improve lifetime reliability by avoiding early failures. 
Based on this understanding, we present two variance reduction techniques for proactive reliability management; (i) proportional dynamic voltage-frequency scaling (DVFS) and (ii) coordinated thread swapping. A major advantage of using variance reduction techniques is that the improvement of system lifetime reliability can be achieved without adding design margins or spare components.", acknowledgement = ack-nhfb, affiliation = "Song, W (Reprint Author), Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta, GA 30332 USA. Song, William; Mukhopadhyay, Saibal; Yalamanchili, Sudhakar, Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta, GA 30332 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "wjhsong@gatech.edu saibal.mukhopadhyay@ece.gatech.edu sudha.yalamanchili@ece.gatech.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Semiconductor Research Corporation [2084.001]; IBM/SRC Graduate Fellowship; Sandia National Laboratories", funding-text = "This research was supported by the Semiconductor Research Corporation under task \#2084.001, IBM/SRC Graduate Fellowship, and Sandia National Laboratories.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architectural reliability; Benchmark testing; Computer architecture; Computer architecture, lifetime estimation, modeling, semiconductor device reliability, simulation; coordinated thread swapping; core mean-time-to-failure; Degradation; design margins; DVFS; full-system simulation; Gaussian distribution; integrated circuit design; Integrated circuit reliability; integrated microarchitecture; lifetime estimation; lifetime reliability characterization; many-core processors; Microarchitecture; microprocessor chips; modeling; multiprocessing systems; normal operating conditions; power aware computing; power models; Program processors; proportional dynamic voltage-frequency scaling; reliability models; semiconductor device reliability; simulation; spare components; thermal models; variance reduction techniques", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "3", unique-id = "Song:2015:ARL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Poluri:2015:SET, author = "Pavan Poluri and Ahmed Louri", title = "A Soft Error Tolerant Network-on-Chip Router Pipeline for Multi-Core Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "107--110", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2360686", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Network-on-Chip (NoC) paradigm is rapidly evolving into an efficient interconnection network to handle the strict communication requirements between the increasing number of cores on a single chip. Diminishing transistor size is making the NoC increasingly vulnerable to both hard faults and soft errors. 
This paper concentrates on soft errors in NoCs. A soft error in an NoC router results in significant consequences such as data corruption, packet retransmission and deadlock among others. To this end, we propose Soft Error Tolerant NoC Router (STNR) architecture, that is capable of detecting and recovering from soft errors occurring in different control stages of the routing pipeline. STNR exploits the use of idle cycles inherent in NoC packet routing pipeline to perform time redundant executions necessary for soft error tolerance. In doing so, STNR is able to detect and correct all single transient faults in the control stages of the pipeline. Simulation results using PARSEC and SPLASH-2 benchmarks show that STNR is able to accomplish such high level of soft error protection with a minimal impact on latency (an increase of 1.7 and 1.6 percent respectively). Additionally, STNR incurs an area overhead of 7 percent and power overhead of 13 percent as compared to the baseline unprotected router.", acknowledgement = ack-nhfb, affiliation = "Poluri, P (Reprint Author), Univ Arizona, Dept Elect \& Comp Engn, Tucson, AZ 85721 USA. Poluri, Pavan; Louri, Ahmed, Univ Arizona, Dept Elect \& Comp Engn, Tucson, AZ 85721 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "pavanp@email.arizona.edu louri@email.arizona.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation (NSF) [CNS-1318997, ECCS-0725765, ECCS-1342702, CCF-1420681]", funding-text = "This research was supported by US National Science Foundation (NSF) awards CNS-1318997, ECCS-0725765, ECCS-1342702 and CCF-1420681.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; data corruption; deadlock; fault tolerance; hard faults; idle cycles; integrated circuit reliability; interconnection network; Multicore processing; multicore systems; multiprocessing systems; network routing; Network-on-chip; network-on-chip; Network-on-chip; NoC packet routing pipeline; packet retransmission; PARSEC; performance; Pipelines; Ports (Computers); radiation hardening (electronics); reliability; Resource management; single chip; single transient faults; soft error; soft error protection; soft error tolerance; soft error tolerant network-on-chip router pipeline; soft error tolerant NoC router architecture; SPLASH-2 benchmarks; STNR architecture; Switches; time redundant executions; Transient analysis; transistor size", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "6", unique-id = "Poluri:2015:SET", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Xiao:2015:SCD, author = "Canwen Xiao and Yue Yang and Jianwen Zhu", title = "A Sufficient Condition for Deadlock-Free Adaptive Routing in Mesh Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "111--114", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2363829", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Deadlock remains a central problem in interconnection network. In this paper, we establish a new theory of deadlock-free flow control for k-ary, n-cube mesh network, which enables the use of any minimal-path adaptive routing algorithms while avoiding deadlock. 
We prove that the proposed flow control algorithm is a sufficient condition for deadlock freedom in any minimal path, adaptive routing algorithms on k-ary, n-cube mesh network.", acknowledgement = ack-nhfb, affiliation = "Xiao, CW (Reprint Author), Natl Univ Def Technol, Changsha, Hunan, Peoples R China. Xiao, Canwen, Natl Univ Def Technol, Changsha, Hunan, Peoples R China. Yang, Yue; Zhu, Jianwen, Univ Toronto, Dept Elect \& Comp Engn, Toronto, ON, Canada.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "cwxiao@nudt.edu.cn yyang@eecg.toronto.edu jzhu@eecg.toronto.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "``863'' program of China [2012AA01A301, 2013AA014301]", funding-text = "This work is supported by ``863'' program of China (2012AA01A301, 2013AA014301).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Adaptive systems; Aerospace electronics; concurrency control; deadlock avoidance; Deadlock-Free; deadlock-free adaptive routing; deadlock-free flow control; flow control; interconnection network; k-ary; k-ary mesh network; mesh networks; Mesh networks; minimal path routing algorithm; minimal-path adaptive routing algorithms; Multiprocessor interconnection; multiprocessor interconnection networks; n-cube mesh network; Routing; sufficient condition; System recovery; Wireless mesh networks", number-of-cited-references = "7", research-areas = "Computer Science", researcherid-numbers = "Yang, Yue/N-8370-2019", times-cited = "1", unique-id = "Xiao:2015:SCD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Mittal:2015:ATE, author = "Sparsh Mittal and Jeffrey S. 
Vetter", title = "{AYUSH}: a Technique for Extending Lifetime of {SRAM--NVM} Hybrid Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "115--118", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2355193", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Recently, researchers have explored way-based hybrid SRAM-NVM (non-volatile memory) last level caches (LLCs) to bring the best of SRAM and NVM together. However, the limited write endurance of NVMs restricts the lifetime of these hybrid caches. We present AYUSH, a technique to enhance the lifetime of hybrid caches, which works by using data-migration to preferentially use SRAM for storing frequently-reused data. Microarchitectural simulations confirm that AYUSH achieves larger improvement in lifetime than a previous technique and also maintains performance and energy efficiency. For single, dual and quad-core workloads, the average increase in cache lifetime with AYUSH is 6.90, 24.06 and 47.62x, respectively.", acknowledgement = ack-nhfb, affiliation = "Mittal, S (Reprint Author), Oak Ridge Natl Lab, Div Math \& Comp Sci, Oak Ridge, TN 37831 USA. Mittal, Sparsh; Vetter, Jeffrey S., Oak Ridge Natl Lab, Div Math \& Comp Sci, Oak Ridge, TN 37831 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mittals@ornl.gov vetter@ornl.gov", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "AYUSH; Benchmark testing; Cache memory; cache storage; data-migration; device lifetime; energy efficiency; Energy loss; hybrid cache; last level caches; microarchitectural simulation; Non-volatile memory (NVM); nonvolatile memory; Nonvolatile memory; Radiation detectors; Random access memory; SRAM; SRAM chips; SRAM-NVM cache; SRAM-NVM hybrid caches; write endurance", keywords-plus = "ENERGY; MODEL", number-of-cited-references = "17", ORCID-numbers = "Vetter, Jeffrey/0000-0002-2449-6720 Mittal, Sparsh/0000-0002-2908-993X", research-areas = "Computer Science", times-cited = "11", unique-id = "Mittal:2015:ATE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Manohar:2015:CSD, author = "Rajit Manohar", title = "Comparing Stochastic and Deterministic Computing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "119--122", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2412553", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Technology scaling has raised the specter of myriads of cheap, but unreliable and/or stochastic devices that must be creatively combined to create a reliable computing system. This has renewed the interest in computing that exploits stochasticity-embracing, not combating the device physics. If a stochastic representation is used to implement a programmable general-purpose architecture akin to CPUs, GPUs, or FPGAs, the preponderance of evidence indicates that most of the system energy will be expended in communication and storage as opposed to computation. 
This paper presents an analytical treatment of the benefits and drawbacks of adopting a stochastic approach by examining the cost of representing a value. We show both scaling laws and costs for low precision representations. We also analyze the cost of multiplication implemented using stochastic versus deterministic approaches, since multiplication is the prototypical inexpensive stochastic operation. We show that the deterministic approach compares favorably to the stochastic approach when holding precision and reliability constant.", acknowledgement = ack-nhfb, affiliation = "Manohar, R (Reprint Author), Cornell Univ, Cornell Tech, New York, NY 10011 USA. Manohar, Rajit, Cornell Univ, Cornell Tech, New York, NY 10011 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "rajit@csl.cornell.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Complexity theory; Computer architecture; deterministic computing; Encoding; field programmable gate arrays; FPGAs; general-purpose architecture; GPUs; graphics processing units; Logic gates; Receivers; reliable computing system; stochastic computing; Stochastic processes; stochastic processes; stochastic representation", number-of-cited-references = "18", research-areas = "Computer Science", times-cited = "5", unique-id = "Manohar:2015:CSD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Seo:2015:DDF, author = "Bon-Keun Seo and Seungryoul Maeng and Joonwon Lee and Euiseong Seo", title = "{DRACO}: a Deduplicating {FTL} for Tangible Extra Capacity", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "123--126", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2350984", ISSN = "1556-6056 (print), 
1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The rapid random access of SSDs enables efficient searching of redundant data and their deduplication. However, the space earned from deduplication cannot be used as permanent storage because it must be reclaimed when deduplication is cancelled as a result of an update to the deduplicated data. To overcome this limitation, we propose a novel FTL scheme that enables the gained capacity to be used as permanent storage space for the file system layer. The proposed approach determines the safe amount of gained capacity that can be provided to the upper layer based on the compression rate prediction scheme. It then secures the required space by compressing cold data when capacity overflow occurs from cancelled deduplication. Our evaluation with a kernel source repository showed that the file system obtained approximately 79 percent additional capacity by the proposed scheme.", acknowledgement = ack-nhfb, affiliation = "Seo, BK (Reprint Author), Korea Adv Inst Sci \& Technol, Dept Comp Sci, Taejon 305701, South Korea. Seo, Bon-Keun; Maeng, Seungryoul, Korea Adv Inst Sci \& Technol, Dept Comp Sci, Taejon 305701, South Korea. Lee, Joonwon; Seo, Euiseong, Sungkyunkwan Univ, Coll Informat \& Commun Engn, Suwon 440746, South Korea.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "joonwon@skku.edu euiseong@skku.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Basic Science Research Program through the National Research Foundation of Korea [2012R1A1A2A10038823]", funding-text = "This research was supported by Basic Science Research Program through the National Research Foundation of Korea (2012R1A1A2A10038823).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "capacity overflow; cold data compression; compression; compression rate prediction scheme; data compression; data deduplication; Data structures; deduplicating FTL; deduplication; disc drives; DRACO; Entropy; file system layer; file systems; File systems; file systems; flash memories; flash memory; Flash memory; flash memory; flash translation layer; FTL; kernel source repository; Linux; over-provisioning; permanent storage space; rapid random access; redundant data searching; SDRAM; SSD; storage management; storage reclamation; tangible extra capacity", number-of-cited-references = "6", research-areas = "Computer Science", researcherid-numbers = "Maeng, Seungryoul/C-1882-2011", times-cited = "2", unique-id = "Seo:2015:DDF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Seshadri:2015:FBB, author = "Vivek Seshadri and Kevin Hsieh and Amirali Boroumand and Donghyuk Lee and Michael A. Kozuch and Onur Mutlu and Phillip B. Gibbons and Todd C.
Mowry", title = "Fast Bulk Bitwise {AND} and {OR} in {DRAM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "127--131", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2434872", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Bitwise operations are an important component of modern day programming, and are used in a variety of applications such as databases. In this work, we propose a new and simple mechanism to implement bulk bitwise AND and OR operations in DRAM, which is faster and more efficient than existing mechanisms. Our mechanism exploits existing DRAM operation to perform a bitwise AND/OR of two DRAM rows completely within DRAM. The key idea is to simultaneously connect three cells to a bitline before the sense-amplification. By controlling the value of one of the cells, the sense amplifier forces the bitline to the bitwise AND or bitwise OR of the values of the other two cells. Our approach can improve the throughput of bulk bitwise AND/OR operations by 9.7X and reduce their energy consumption by 50.5X. Since our approach exploits existing DRAM operation as much as possible, it requires negligible changes to DRAM logic. We evaluate our approach using a real-world implementation of a bit-vector based index for databases. Our mechanism improves the performance of commonly-used range queries by 30 percent on average.", acknowledgement = ack-nhfb, affiliation = "Seshadri, V (Reprint Author), Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Seshadri, Vivek; Hsieh, Kevin; Boroumand, Amirali; Lee, Donghyuk; Mutlu, Onur; Mowry, Todd C., Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Kozuch, Michael A.; Gibbons, Phillip B., Intel Pittsburgh, Pittsburgh, PA USA.", ajournal = "IEEE Comput. Archit.
Lett.", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [0953246, 1212962, 1320531]; Intel Science and Tech. Center; Samsung; Google; Facebook; SRC", funding-text = "This work was supported by NSF (awards 0953246, 1212962, and 1320531), and Intel Science and Tech. Center, Samsung, Google, Facebook, and SRC.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "bit-vector based index; bitwise AND/OR; bulk-bitwise AND operation; bulk-bitwise OR operation; Capacitors; cell value control; Computer architecture; database indexing; Decoding; DRAM; DRAM chips; DRAM memory; DRAM memory, bitwise AND/OR, performance; DRAM operation; energy consumption reduction; logic gates; performance; performance improvement; Program processors; Random access memory; range queries; sense amplifier; sense-amplification; Throughput; throughput improvement", number-of-cited-references = "20", research-areas = "Computer Science", times-cited = "21", unique-id = "Seshadri:2015:FBB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Altaf:2015:LPM, author = "Muhammad Shoaib Bin Altaf and David A. 
Wood", title = "{LogCA}: a Performance Model for Hardware Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "132--135", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2360182", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "To address the Dark Silicon problem, architects have increasingly turned to special-purpose hardware accelerators to improve the performance and energy efficiency of common computational kernels, such as encryption and compression. Unfortunately, the latency and overhead required to off-load a computation to an accelerator sometimes outweighs the potential benefits, resulting in a net decrease in performance or energy efficiency. To help architects and programmers reason about these trade-offs, we have developed the LogCA model, a simple performance model for hardware accelerators. LogCA provides a simplified abstraction of a hardware accelerator characterized by five key parameters. We have validated the model against a variety of accelerators, ranging from on-chip cryptographic accelerators in Sun's UltraSparc T2 and Intel's Sandy Bridge to both discrete and integrated GPUs.", acknowledgement = ack-nhfb, affiliation = "Bin Altaf, MS (Reprint Author), Univ Wisconsin, Madison, WI 53706 USA. Bin Altaf, Muhammad Shoaib; Wood, David A., Univ Wisconsin, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "shoaibbinalt@wisc.edu david@cs.wisc.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CNS-1117280, CCF-1218323, CNS-1302260]", funding-text = "We thank Mark Hill, Michael Swift, Rathijit Sen, and the members of the Wisconsin Multifacet group for their comments on the paper. This work is supported in part with NSF grants CNS-1117280, CCF-1218323, and CNS-1302260. The views expressed herein are not necessarily those of the NSF. Professor Wood has significant financial interests in AMD, Google and Panasas.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accelerators; compression; computational kernel; Computational modeling; cryptography; dark silicon problem; encryption; energy conservation; energy efficiency; GPU; graphics processing units; Hardware accelerators; heterogeneous systems; Intel Sandy Bridge; LogCA model; Modeling; modeling techniques; modeling techniques,; on-chip cryptographic accelerator; Performance evaluation; performance model; performance of systems; special-purpose hardware accelerator; UltraSparc T2", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "0", unique-id = "Altaf:2015:LPM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Diamantopoulos:2015:MMI, author = "Dionysios Diamantopoulos and Sotirios Xydis and Kostas Siozios and Dimitrios Soudris", title = "Mitigating Memory-Induced Dark Silicon in Many-Accelerator Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "136--139", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2410791", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Many-Accelerator (MA) systems have been introduced as a promising architectural paradigm that can boost performance and improve power of general-purpose computing platforms. In this paper, we focus on the problem of resource under-utilization, i.e., Dark Silicon, in FPGA-based MA platforms. We show that except the typically expected peak power budget, on-chip memory resources form a severe under-utilization factor in MA platforms, leading up to 75 percent of dark silicon. Recognizing that static memory allocation-the de-facto mechanism supported by modern design techniques and synthesis tools-forms the main source of memory-induced Dark Silicon, we introduce a novel framework that extends conventional high level synthesis (HLS) with dynamic memory management (DMM) features, enabling accelerators to dynamically adapt their allocated memory to the runtime memory requirements, thus maximizing the overall accelerator count through effective sharing of FPGA's memories resources. We show that our technique delivers significant gains in FPGA's accelerators density, i.e. 3.8x, and application throughput up to 3.1x and 21.4x for shared and private memory accelerators.", acknowledgement = ack-nhfb, affiliation = "Diamantopoulos, D (Reprint Author), Natl Tech Univ Athens, Sch Elect \& Comp Engn, Athens, Greece. Diamantopoulos, Dionysios; Xydis, Sotirios; Siozios, Kostas; Soudris, Dimitrios, Natl Tech Univ Athens, Sch Elect \& Comp Engn, Athens, Greece.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "diamantd@microlab.ntua.gr sxydis@microlab.ntua.gr ksiop@microlab.ntua.gr dsoudris@microlab.ntua.gr", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "E.C. [644906]", funding-text = "This research is partially supported by the E.C. 
funded program AEGLE under H2020 Grant Agreement No: 644906.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "de-facto mechanism; DMM feature; dynamic memory management; dynamic memory management feature; Dynamic scheduling; Field programmable gate arrays; field programmable gate arrays; FPGA-based MA platform; high-level synthesis; high-level synthesis tool; HLS tool; MA system; Many-accelerator architectures; many-accelerator architectures; Many-accelerator architectures; Memory management; memory-induced dark silicon source; modern design technique; Network architecture; on-chip memory resource; peak power budget; power aware computing; Resource management; severe under-utilization factor; silicon; static memory allocation; storage management; System-on-chip; Throughput", number-of-cited-references = "14", ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847 Siozios, Kostas/0000-0002-0285-2202", research-areas = "Computer Science", researcherid-numbers = "Soudris, Dimitrios/O-8843-2019 Siozios, Kostas/F-9726-2011", times-cited = "1", unique-id = "Diamantopoulos:2015:MMI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Poremba:2015:NUF, author = "Matthew Poremba and Tao Zhang and Yuan Xie", title = "{NVMain 2.0}: a User-Friendly Memory Simulator to Model (Non-) Volatile Memory Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "140--143", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2402435", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this letter, a flexible memory simulator --- NVMain 2.0, is introduced to help the community for modeling not only commodity DRAMs but also emerging memory 
technologies, such as die-stacked DRAM caches, non-volatile memories (e.g., STT-RAM, PCRAM, and ReRAM) including multi-level cells (MLC), and hybrid non-volatile plus DRAM memory systems. Compared to existing memory simulators, NVMain 2.0 features a flexible user interface with compelling simulation speed and the capability of providing sub-array-level parallelism, fine-grained refresh, MLC and data encoder modeling, and distributed energy profiling.", acknowledgement = ack-nhfb, affiliation = "Poremba, M (Reprint Author), Penn State Univ, Dept Comp Sci \& Engn, University Pk, PA 16802 USA. Poremba, Matthew; Zhang, Tao; Xie, Yuan, Penn State Univ, Dept Comp Sci \& Engn, University Pk, PA 16802 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "poremba@cse.psu.edu zhangtao@cse.psu.edu yuanxie@cse.psu.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [1218867, 1213052, 1409798]; Department of Energy [DE-SC0005026]", funding-text = "Poremba, Zhang, and Xie were supported in part by NSF 1218867, 1213052, 1409798. This material was based on work supported by the Department of Energy under Award Number DE-SC0005026. Matthew Poremba is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; commodity DRAM; Computational modeling; Computer architecture; die-stacked DRAM cache; DRAM chips; DRAM memory systems; flexible memory simulator; flexible user interface; Memory architecture; memory architecture; Memory architecture, random access memory, nonvolatile memory, phase change memory, SDRAM; Memory management; memory technology; multilevel cells; nonvolatile memory; Nonvolatile memory; nonvolatile memory system; NVMain 2.0; PCRAM; phase change memories; phase change memory; Phase change random access memory; random access memory; ReRAM; SDRAM; STT-RAM; user interfaces; user-friendly memory simulator", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "36", unique-id = "Poremba:2015:NUF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Vandierendonck:2015:EEB, author = "Hans Vandierendonck and Ahmad Hassan and Dimitrios S. Nikolopoulos", title = "On the Energy-Efficiency of Byte-Addressable Non-Volatile Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "144--147", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2355195", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Non-volatile memory (NVM) technology holds promise to replace SRAM and DRAM at various levels of the memory hierarchy. The interest in NVM is motivated by the difficulty faced in scaling DRAM beyond 22 nm and, long-term, lower cost per bit. While offering higher density and negligible static power (leakage and refresh), NVM suffers increased latency and energy per memory access. 
This paper develops energy and performance models of memory systems and applies them to understand the energy-efficiency of replacing or complementing DRAM with NVM. Our analysis focusses on the application of NVM in main memory. We demonstrate that NVM such as STT-RAM and RRAM is energy-efficient for memory sizes commonly employed in servers and high-end workstations, but PCM is not. Furthermore, the model is well suited to quickly evaluate the impact of changes to the model parameters, which may be achieved through optimization of the memory architecture, and to determine the key parameters that impact system-level energy and performance.", acknowledgement = ack-nhfb, affiliation = "Vandierendonck, H (Reprint Author), Queens Univ Belfast, Belfast BT7 1NN, Antrim, North Ireland. Vandierendonck, Hans; Nikolopoulos, Dimitrios S., Queens Univ Belfast, Belfast BT7 1NN, Antrim, North Ireland. Hassan, Ahmad, SAP Belfast, Belfast, Antrim, North Ireland.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "h.vandierendonck@qub.ac.uk ahmad.hassan@sap.com d.nikolopoulos@qub.ac.uk", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "People Programme (Marie Curie Actions) of the European Union's Seventh Framework Programme [327744]", funding-text = "This work was supported by the People Programme (Marie Curie Actions) of the European Union's Seventh Framework Programme (FP7/2007-2013), grant agreement no. 327744.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "byte-addressable nonvolatile memory technology; Computational modeling; DRAM; DRAM chips; energy; energy conservation; energy efficiency; Enery efficiency; impact system-level energy; Main memory systems; Main memory systems, non-volatile memory, energy, modeling; Mathematical model; memory architecture; memory hierarchy; Memory management; memory systems; modeling; non-volatile memory; Nonvolatile memory; NVM technology; PCM; Phase change materials; Random access memory; RRAM; SRAM; SRAM chips; static power; STT-RAM", number-of-cited-references = "15", oa = "Green Published", research-areas = "Computer Science", times-cited = "0", unique-id = "Vandierendonck:2015:EEB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yavits:2015:RAP, author = "Leonid Yavits and Shahar Kvatinsky and Amir Morad and Ran Ginosar", title = "Resistive Associative Processor", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "148--151", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2374597", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Associative Processor (AP) combines data storage and data processing, and functions simultaneously as a massively parallel array SIMD processor and memory. Traditionally, AP is based on CMOS technology, similar to other classes of massively parallel SIMD processors. The main component of AP is a Content Addressable Memory (CAM) array. As CMOS feature scaling slows down, CAM experiences scalability problems. In this work, we propose and investigate an AP based on resistive CAM-the Resistive AP (ReAP). 
We show that resistive memory technology potentially allows scaling the AP from a few millions to a few hundred millions of processing units on a single silicon die. We compare the performance and power consumption of a ReAP to a CMOS AP and a conventional SIMD accelerator (GPU) and show that ReAP, although exhibiting higher power density, allows better scalability and higher performance.", acknowledgement = ack-nhfb, affiliation = "Yavits, L (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa, Israel. Yavits, Leonid; Kvatinsky, Shahar; Morad, Amir; Ginosar, Ran, Technion Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yavits@tx.technion.ac.il skva@tx.technion.ac.il amirm@tx.technion.ac.il ran@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Intel Collaborative Research Institute for Computational Intelligence; Hasso-Plattner-Institut", funding-text = "The authors would like to thank Uri Weiser for inspiring this research. This work was partially funded by the Intel Collaborative Research Institute for Computational Intelligence and by Hasso-Plattner-Institut.", journal-iso = "IEEE Comput. Archit.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Associative processing; associative processor; Associative Processor; associative processor; CAM array; CMOS feature scaling; CMOS integrated circuits; CMOS technology; complementary metal oxide semiconductor; Computer aided manufacturing; content addressable memory array; content-addressable storage; data processing; data storage; GPU; graphics processing unit; in-memory computing; In-Memory Computing; in-memory computing; massively parallel array SIMD processor; memory function; memristor; Memristor; memristor; Memristors; parallel processing; Random access memory; ReAP; resistive associative processor; resistive RAM; Resistive RAM; resistive RAM; SIMD; SIMD accelerator", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "22", unique-id = "Yavits:2015:RAP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kang:2015:SRT, author = "Suk Chan Kang and Chrysostomos Nicopoulos and Ada Gavrilovska and Jongman Kim", title = "Subtleties of Run-Time Virtual Address Stacks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "152--155", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2337299", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The run-time virtual address (VA) stack has some unique properties, which have garnered the attention of researchers. The stack one-dimensionally grows and shrinks at its top, and contains data that is seemingly local/private to one thread, or process. Most prior related research has focused on these properties. 
However, this article aims to demonstrate how conventional wisdom pertaining to the run-time VA stack fails to capture some critical subtleties and complexities. We first explore two widely established assumptions surrounding the VA stack area: (1) Data accesses can be classified as falling either under VA-stack-area accesses, or non-stack-area accesses, with no aliasing; (2) The VA stack data is completely private and invisible to other threads/processes. Subsequently, we summarize a representative selection of related work that pursued the micro-architectural concept of using run-time VA stacks to extend the general-purpose register file. We then demonstrate why these assumptions are invalid, by using examples from prior work to highlight the potential hazards regarding data consistency, shared memory consistency, and cache coherence. Finally, we suggest safeguards against these hazards. Overall, we explore the function-critical issues that future operating systems and compilers should address to effectively reap all the benefits of using run-time VA stacks.", acknowledgement = ack-nhfb, affiliation = "Kang, SC (Reprint Author), Georgia Inst Technol, Atlanta, GA 30332 USA. Kang, Suk Chan; Gavrilovska, Ada; Kim, Jongman, Georgia Inst Technol, Atlanta, GA 30332 USA. Nicopoulos, Chrysostomos, Univ Cyprus, CY-1678 Nicosia, Cyprus.", ajournal = "IEEE Comput. Archit. Lett.", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache coherence; cache storage; data consistency; data decoupling; data integrity; data privacy; function-critical issue; general-purpose register file; Instruction sets; memory consistency; microarchitectural concept; nonstack-area access; register file; Run time; Run-time stack; run-time VA stack data access; run-time virtual address stack; shared memory; shared memory consistency; shared memory systems; synonym page; VA-stack-area accesses; Virtualization", number-of-cited-references = "12", ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068", research-areas = "Computer Science", times-cited = "0", unique-id = "Kang:2015:SRT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Rodopoulos:2015:TPV, author = "Dimitrios Rodopoulos and Francky Catthoor and Dimitrios Soudris", title = "Tackling Performance Variability Due to {RAS} Mechanisms with {PID}-Controlled {DVFS}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "156--159", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2385713", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As technology nodes approach deca-nanometer dimensions, many phenomena threaten the binary correctness of processor operation. Computer architects typically enhance their designs with reliability, availability and serviceability (RAS) schemes to correct such errors, in many cases at the cost of extra clock cycles, which, in turn, leads to processor performance variability. The goal of the current paper is to absorb this variability using Dynamic Voltage and Frequency Scaling (DVFS). 
A closed-loop implementation is proposed, which configures the clock frequency based on observed metrics that encapsulate performance variability due to RAS mechanisms. That way, performance dependability and predictability is achieved. We simulate the transient and steady state behavior of our approach, reporting responsiveness within less than 1 ms. We also assess our idea using the power model of real processor and report a maximum energy overhead of roughly 10 percent for dependable performance in the presence of RAS temporal overheads.", acknowledgement = ack-nhfb, affiliation = "Rodopoulos, D (Reprint Author), Natl Tech Univ Athens, MicroLab, Sch Elect \& Comp Engn, Athens 15780, Greece. Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech Univ Athens, MicroLab, Sch Elect \& Comp Engn, Athens 15780, Greece. Catthoor, Francky, ESAT KU Leuven, Leuven, Belgium. Catthoor, Francky, SSET IMEC, Leuven, Belgium.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "drodo@microlab.ntua.gr catthoor@imec.be dsoudris@microlab.ntua.gr", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "HARPA EC project [FP7-612069]", funding-text = "The authors thank Prof. Y. Sazeides and Prof. C. Nicopoulos of UCY, Cyprus for the insightful discussions. They also acknowledge the constructive feedback of the reviewers. This work was partially supported by the FP7-612069-HARPA EC project. Dimitrios Rodopoulos is the corresponding author. Finally, the authors acknowledge conversations with Dr. Antonis Papanikolaou.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "availability; availability and serviceability; Availability and Serviceability; availability and serviceability; binary correctness; closed loop systems; closed-loop implementation; computer architects; computer architecture; deca-nanometer dimensions; Dynamic voltage and frequency scaling; dynamic voltage and frequency scaling; Dynamic voltage and frequency scaling; Dynamic Voltage and Frequency Scaling; Mathematical model; microcomputers; Performance evaluation; performance variability; performance vulnerability factor; Performance Vulnerability Factor; PID-controlled DVFS; Process control; processor operation; RAS mechanisms; reliability; Reliability; reliability; Reliability; serviceability; three-term control; Voltage control", number-of-cited-references = "21", ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847", research-areas = "Computer Science", researcherid-numbers = "Soudris, Dimitrios/O-8843-2019", times-cited = "4", unique-id = "Rodopoulos:2015:TPV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Markovic:2015:TLS, author = "Nikola Markovic and Daniel Nemirovsky and Osman Unsal and Mateo Valero and Adrian Cristal", title = "Thread Lock Section-Aware Scheduling on Asymmetric Single-{ISA} Multi-Core", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "160--163", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2357805", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As thread level parallelism in applications has continued to expand, so has research in chip multi-core processors. 
As more and more applications become multi-threaded we expect to find a growing number of threads executing on a machine. As a consequence, the operating system will require increasingly larger amounts of CPU time to schedule these threads efficiently. Instead of perpetuating the trend of performing more complex thread scheduling in the operating system, we propose a scheduling mechanism that can be efficiently implemented in hardware as well. Our approach of identifying multi-threaded application bottlenecks such as thread synchronization sections complements the Fairness-aware Scheduler method. It achieves an average speed up of 11.5 percent (geometric mean) compared to the state-of-the-art Fairness-aware Scheduler.", acknowledgement = ack-nhfb, affiliation = "Markovic, N (Reprint Author), Barcelona Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky, Daniel; Unsal, Osman; Valero, Mateo, Barcelona Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky, Daniel; Valero, Mateo, Univ Politecn Cataluna, Barcelona, Spain. Cristal, Adrian, Univ Politecn Cataluna, Barcelona Supercomputing Ctr, E-08028 Barcelona, Spain. Cristal, Adrian, Artificial Intelligence Res Inst Spanish Natl Res, Barcelona, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "nikola.markovic@bsc.es daniel.nemirovsky@bsc.es osman.unsal@bsc.es mateo.valero@bsc.es adrian.cristal@bsc.es", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Asymmetric chip multiprocessor (ACMP); asymmetric single-ISA multicore processor; chip multicore processors; Context modeling; fairness-aware scheduler method; HW/SW thread scheduling; Instruction sets; microprocessor chips; multi-threaded applications; multi-threading; Multicore processing; multiprocessing systems; multithreaded application; operating system; Operating systems; operating systems (computers); scheduling; Scheduling; Synchronization; thread lock section-aware scheduling mechanism; thread synchronization", number-of-cited-references = "17", ORCID-numbers = "UNSAL, OSMAN/0000-0002-0544-9697 Valero, Mateo/0000-0003-2917-2482", research-areas = "Computer Science", researcherid-numbers = "UNSAL, OSMAN/B-9161-2016 Valero, Mateo/L-5709-2014", times-cited = "7", unique-id = "Markovic:2015:TLS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Pekhimenko:2015:TAC, author = "Gennady Pekhimenko and Evgeny Bolotin and Mike O'Connor and Onur Mutlu and Todd C. Mowry and Stephen W. Keckler", title = "Toggle-Aware Compression for {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "164--168", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2430853", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory bandwidth compression can be an effective way to achieve higher system performance and energy efficiency in modern data-intensive applications by exploiting redundancy in data. 
Prior works studied various data compression techniques to improve both capacity (e.g., of caches and main memory) and bandwidth utilization (e.g., of the on-chip and off-chip interconnects). These works addressed two common shortcomings of compression: (i) compression/decompression overhead in terms of latency, energy, and area, and (ii) hardware complexity to support variable data size. In this paper, we make the new observation that there is another important problem related to data compression in the context of the communication energy efficiency: transferring compressed data leads to a substantial increase in the number of bit toggles (communication channel switchings from 0 to 1 or from 1 to 0). This, in turn, increases the dynamic energy consumed by on-chip and off-chip buses due to more frequent charging and discharging of the wires. Our results, for example, show that the bit toggle count increases by an average of 2.2x with some compression algorithms across 54 mobile GPU applications. We characterize and demonstrate this new problem across a wide variety of 221 GPU applications and six different compression algorithms. To mitigate the problem, we propose two new toggle-aware compression techniques: energy control and Metadata Consolidation. These techniques greatly reduce the bit toggle count impact of the six data compression algorithms we examine, while keeping most of their bandwidth reduction benefits.", acknowledgement = ack-nhfb, affiliation = "Pekhimenko, G (Reprint Author), Carnegie Mellon Univ, Dept Comp Sci, Pittsburgh, PA 15206 USA. Pekhimenko, Gennady; Mutlu, Onur; Mowry, Todd C., Carnegie Mellon Univ, Dept Comp Sci, Pittsburgh, PA 15206 USA. Bolotin, Evgeny; O'Connor, Mike; Keckler, Stephen W., NVIDA, Santa Clara, CA USA. O'Connor, Mike; Keckler, Stephen W., Univ Texas Austin, Austin, TX 78712 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "gpekhimento@gmail.com ebolotin@nvidia.com moconnor@nvidia.com omutlu@gmail.com tcm@cs.cmu.edu skeckler@nvidia.com", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Intel Science and Technology Center for Cloud Computing; US National Science Foundation [1212962, 1409723, 1423172]; US Department of Energy", funding-text = "The authors acknowledge the support of Intel Science and Technology Center for Cloud Computing; US National Science Foundation grants 1212962, 1409723, and 1423172; and the US Department of Energy.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "bandwidth utilization; bit toggle count impact; bit toggles; Communication channels; communication energy efficiency; Compression algorithms; compression/decompression overhead; Data compression; data compression; data compression algorithms; data compression techniques; Data compression, interconnected systems, memory; data redundancy; dynamic energy; energy control; graphics processing units; Graphics processing units; hardware complexity; interconnected systems; memory; memory bandwidth compression; metadata consolidation; Mobile communication; mobile GPU applications; modern data-intensive applications; off-chip buses; on-chip buses; power aware computing; System-on-chip; toggle-aware compression; variable data size", number-of-cited-references = "29", research-areas = "Computer Science", times-cited = "2", unique-id = "Pekhimenko:2015:TAC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2015:TCb, author = "Anonymous", title = "Table of Contents", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "C1--C1", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2510172", ISSN = 
"1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:ICAc, author = "Anonymous", title = "{{\booktitle{IEEE Computer Architecture Letters}} Editorial Board}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "C2--C2", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2510173", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:ICAd, author = "Anonymous", title = "{{\booktitle{IEEE Computer Architecture Letters}}} Information for Authors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "C3--C3", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2510174", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2015:ICSb, author = "Anonymous", title = "{IEEE Computer Society}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "C4--C4", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2510176", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Wu:2016:MCN, author = "Wo-Tak Wu and Ahmed Louri", title = "A Methodology for Cognitive {NoC} Design", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2447535", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The number of cores in a multicore chip design has been increasing in the past two decades. The rate of increase will continue for the foreseeable future. With a large number of cores, the on-chip communication has become a very important design consideration. The increasing number of cores will push the communication complexity level to a point where managing such highly complex systems requires much more than what designers can anticipate for. We propose a new design methodology for implementing a cognitive network-on-chip that has the ability to recognize changes in the environment and to learn new ways to adapt to the changes. 
This learning capability provides a way for the network to manage itself. Individual network nodes work autonomously to achieve global system goals, e.g., low network latency, higher reliability, power efficiency, adaptability, etc. We use fault-tolerant routing as a case study. Simulation results show that the cognitive design has the potential to outperform the conventional design for large applications. With the great inherent flexibility to adopt different algorithms, the cognitive design can be applied to many applications.", acknowledgement = ack-nhfb, affiliation = "Wu, WT (Reprint Author), Univ Arizona, Dept Elect \& Comp Engn, Tucson, AZ 85721 USA. Wu, Wo-Tak; Louri, Ahmed, Univ Arizona, Dept Elect \& Comp Engn, Tucson, AZ 85721 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "wotakwu@email.arizona.edu louri@ece.arizona.edu", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "adaptive; Algorithm design and analysis; cognitive network-on-chip; cognitive NoC design; cognitive process; communication complexity; communication complexity level; Fault tolerance; fault tolerant computing; Fault tolerant systems; fault-tolerant; fault-tolerant routing; individual network nodes; integrated circuit design; intelligent agent; learning (artificial intelligence); learning capability; machine learning; multicore; multicore chip design; Multicore processing; multiprocessing systems; network routing; network-on-chip; NoC; on-chip communication; Routing; Software", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "1", unique-id = "Wu:2016:MCN", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2016:IICa, author = "Anonymous", title = "2015 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 14", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "1--6", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2513858", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Indexes", } @Article{Anonymous:2016:IICb, author = "Anonymous", title = "2015 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 
14", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "1--6", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2513858", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 08:36:31 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Presents the 2015 author/subject index for this publication.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Indexes", } @Article{Rezaei:2016:DRS, author = "Seyyed Hossein Seyyedaghaei Rezaei and Abbas Mazloumi and Mehdi Modarressi and Pejman Lotfi-Kamran", title = "Dynamic Resource Sharing for High-Performance {$3$-D} Networks-on-Chip", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2448532", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "3D logic-on-logic technology is a promising approach for extending the validity of Moore's law when technology scaling stops. 3D technology can also lead to a paradigm shift in on-chip communication design by providing orders of magnitude higher bandwidth and lower latency for inter-layer communication. To turn the 3D technology bandwidth and latency benefits into network latency reductions and performance improvement, we need networks-on-chip (NoCs) that are specially designed to take advantage of what 3D technology has to offer. 
While in parallel workloads many packets experience blocking in the network due to losing arbitration for crossbars' input/output ports, we observe that in a considerable fraction of these cases in a 3D NoC, the corresponding input and output ports of the crossbar in the above or below router are idle. Given this observation, we propose FRESH, a router microarchitecture with Fine-grained 3D REsource SHaring capability that leverages the ultra-low latency vertical links of a 3D chip to share crossbars and links at a fine granularity between vertically stacked routers. It enables packets that lose arbitration for crossbars' input/output ports to use idle resources of the above or below routers, and effectively eliminates the unnecessary packet blocking time. We will show that our proposal lowers network latency by up to 21 percent over the state-of-the-art 3D NoC.", acknowledgement = ack-nhfb, affiliation = "Rezaei, SHS (Reprint Author), Univ Tehran, Coll Engn, Dept Elect \& Comp Engn, Tehran, Iran. Rezaei, Seyyed Hossein Seyyedaghaei; Mazloumi, Abbas; Modarressi, Mehdi, Univ Tehran, Coll Engn, Dept Elect \& Comp Engn, Tehran, Iran. Lotfi-Kamran, Pejman, Inst Res Fundamental Sci IPM, Sch Comp Sci, Tehran, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "s.hseyyedaghaei@ut.ac.ir y.mazloomi@gmail.com modarressi@ut.ac.ir plotfi@ipm.ir", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3-D integration; 3D integration; 3D networks-on-chip; 3D NoC; Bandwidth; crossbars input-output ports; fine-grained 3D resource sharing capability; FRESH; network latency; network routing; network-on-chip; Ports (Computers); Resource management; Resource sharing; router microarchitecture; Routing; Switches; Three-dimensional displays; Through-silicon vias", keywords-plus = "3D; ROUTER", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "4", unique-id = "Rezaei:2016:DRS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gorgues:2016:EPC, author = "Miguel Gorgues and Jose Flich", title = "End-Point Congestion Filter for Adaptive Routing with Congestion-Insensitive Performance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2429130", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Interconnection networks are a critical component in most modern systems nowadays. Both off-chip networks, in HPC systems, data centers, and cloud servers, and on-chip networks, in chip multiprocessors (CMPs) and multiprocessors system-on-chip (MPSoCs), play an increasing role as their performance is vital for the performance of the whole system. One of the key components of any interconnect is the routing algorithm, which steers packets through the network. Adaptive routing algorithms have demonstrated their superior performance by maximizing network resources utilization. However, as systems increase in size (both in off-chip and on-chip), new problems emerge. 
One of them is congestion where traffic jams inside the network lead to low throughput and high packet latency, significantly impacting overall system performance. We propose a mechanism to eradicate this phenomena and to allow adaptive routing algorithms to achieve the expected performance even in the presence of congestion situations. End-Point Congestion Filter, EPC, detects congestion formed at the end-points of the network, and prevents the congestion from spreading through the network. Basically, EPC disables adaptivity in congested packets. Preliminary results for mid and high congestion situations show EPC is able to totally decouple congestion from routing.", acknowledgement = ack-nhfb, affiliation = "Gorgues, M (Reprint Author), Univ Politecn Valencia, Dept Comp Architecture, E-46022 Valencia, Spain. Gorgues, Miguel; Flich, Jose, Univ Politecn Valencia, Dept Comp Architecture, E-46022 Valencia, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "migoral@disca.upv.es jflich@disca.upv.es", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Adaptive filters; Adaptive routing algorithms; adaptive routing algorithms; Adaptive routing algorithms; adaptive routing algorithms; Adaptive routing algorithms; Adaptive systems; chip multiprocessors; cloud servers; CMP; Congestion; congestion; Congestion; congestion; congestion-insensitive performance; data centers; digital filters; end-point congestion filter; EPC; HPC systems; Information filters; interconnection networks; interconnects; MPSoC; multiprocessor interconnection networks; multiprocessors system-on-chip; network resources utilization; network routing; on-chip networks; packet latency; performance evaluation; Ports (Computers); Routing; system-on-chip; Throughput; traffic jams", keywords-plus = "NETWORKS", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "0", unique-id = "Gorgues:2016:EPC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Panda:2016:EPP, author = "Biswabandan Panda and Shankar Balachandran", title = "Expert Prefetch Prediction: An Expert Predicting the Usefulness of Hardware Prefetchers", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2428703", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware prefetching improves system performance by hiding and tolerating the latencies of lower levels of cache and off-chip DRAM. An accurate prefetcher improves system performance whereas an inaccurate prefetcher can cause cache pollution and consume additional bandwidth. 
Prefetch address filtering techniques improve prefetch accuracy by predicting the usefulness of a prefetch address and based on the outcome of the prediction, the prefetcher decides whether or not to issue a prefetch request. Existing techniques use only one signature to predict the usefulness of a prefetcher but no single predictor works well across all the applications. In this work, we propose weighted-majority filter, an expert way of predicting the usefulness of prefetch addresses. The proposed filter is adaptive in nature and uses the prediction of the best predictor(s) from a pool of predictors. Our filter is orthogonal to the underlying prefetching algorithm. We evaluate the effectiveness of our technique on 22 SPEC-2000/2006 applications. On an average, when employed with three state-of-the-art prefetchers such as AMPM, SMS, and GHB-PC/DC, our filter provides performance improvement of 8.1, 9.3, and 11 percent respectively.", acknowledgement = ack-nhfb, affiliation = "Panda, B (Reprint Author), Indian Inst Technol, Dept Comp Sci \& Engn, Madras, Tamil Nadu, India. Panda, Biswabandan; Balachandran, Shankar, Indian Inst Technol, Dept Comp Sci \& Engn, Madras, Tamil Nadu, India.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "biswa.uce@gmail.com", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; AMPM; cache; Cache; cache; Cache; cache; cache storage; filtering theory; GHB-PC/DC; Hardware; hardware prefetchers; Hardware prefetching; Hardware Prefetching; Hardware prefetching; Hardware Prefetching; Memory systems; memory systems; Memory systems; memory systems; Pollution; Prediction algorithms; prefetch addresses; Prefetching; prefetching algorithm; Radiation detectors; Random access memory; SMS; weighted-majority filter", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "2", unique-id = "Panda:2016:EPP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Eker:2016:EEC, author = "Abdulaziz Eker and O{\u{g}}uz Ergin", title = "Exploiting Existing Copies in Register File for Soft Error Correction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2435705", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Soft errors are an increasingly important problem in contemporary digital systems. Being the major data holding component in contemporary microprocessors, the register file has been an important part of the processor on which researchers offered many different schemes to protect against soft errors. In this paper we build on the previously proposed schemes and start with the observation that many register values already have a replica inside the storage space. We use this already available redundancy inside the register file in combination with a previously proposed value replication scheme for soft error detection and correction. 
We show that, by employing schemes that make use of the already available copies of the values inside the register file, it is possible to detect and correct 39.0 percent of the errors with an additional power consumption of 18.9 percent.", acknowledgement = ack-nhfb, affiliation = "Eker, A (Reprint Author), TOBB Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey. Eker, Abdulaziz; Ergin, O{\u{g}}uz, TOBB Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "aeker@etu.edu.tr oergin@etu.edu.tr", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "TUBITAK [112E004]", funding-text = "This work was supported in part by TUBITAK under Grant 112E004. The work is in the framework of COST Action 1103.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; contemporary digital systems; contemporary microprocessors; data holding component; Error correction; Error correction codes; microcomputers; microprocessor architecture; Microprocessors; Parity check codes; redundancy; register file; Registers; Reliability; soft error; soft error correction; soft error detection; storage space", number-of-cited-references = "16", ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787", research-areas = "Computer Science", researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010", times-cited = "1", unique-id = "Eker:2016:EEC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Maycock:2016:HES, author = "Matthew Maycock and Simha Sethumadhavan", title = "Hardware Enforced Statistical Privacy", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2403359", ISSN = "1556-6056 
(print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The Internet of Things will result in users generating vast quantities of data, some of it sensitive. Results from the statistical analysis of sensitive data across wide ranges of demographics will become ever more useful to data analysts and their clients. The competing needs of the two groups --- data generators with their desire for privacy and analysts with their desire for inferred statistics --- will be met through the use of statistical privacy techniques. The question, then, is how can we ensure that the statistical methods are applied in a trustable manner? In this paper we discuss some of the complications and consequences of ensuring both trust and privacy through the immutability of hardware, providing a desiderata for a hardware privacy platform.", acknowledgement = ack-nhfb, affiliation = "Maycock, M (Reprint Author), Columbia Univ, Dept Comp Sci, CASTL, New York, NY 10027 USA. Maycock, Matthew; Sethumadhavan, Simha, Columbia Univ, Dept Comp Sci, CASTL, New York, NY 10027 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mhm2159@columbia.edu simha@columbia.edu", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Alfred P. Sloan Foundation; [FA8750-10-2-0253]", funding-text = "This work was supported through grant FA8750-10-2-0253 and the Alfred P. Sloan Foundation. Opinions, findings, conclusions and recommendations expressed in this material are those of the authors and may not reflect the views of the funding entities. Simha Sethumadhavan is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "data analysis; Data privacy; data privacy; Data privacy; Engines; Hardware; hardware enforced statistical privacy; hardware immutability; hardware support; Hardware Support; hardware support; Hardware Support; hardware support; Internet of things; Internet of Things; Internet of things; Internet of Things; Internet of things; Internet of Things; Noise; Privacy; privacy; Privacy; privacy; Privacy; privacy; Privacy; privacy protection unit; Privacy Protection Unit; privacy protection unit; Privacy Protection Unit; privacy protection unit; Security; sensitive data; Software; statistical analysis", number-of-cited-references = "7", research-areas = "Computer Science", times-cited = "1", unique-id = "Maycock:2016:HES", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Li:2016:ICL, author = "Dongdong Li and Tor M. Aamodt", title = "Inter-Core Locality Aware Memory Scheduling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2435709", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Graphics Processing Units (GPUs) run thousands of parallel threads and achieve high Memory Level Parallelism (MLP). To support high Memory Level Parallelism, a structure called a Miss-Status Holding Register (MSHR) handles multiple in-flight miss requests. When multiple cores send requests to the same cache line, the requests are merged into one last level cache MSHR entry and only one memory request is sent to the Dynamic Random-Access Memory (DRAM). We call this inter-core locality. 
The main reason for inter-core locality is that multiple cores access shared read-only data within the same cache line. By prioritizing memory requests that have high inter-core locality, more threads resume execution. In this paper, we analyze the reason for inter-core locality and show that requests with inter-core locality are more critical to performance. We propose a GPU DRAM scheduler that exploits information about inter-core locality detected at the last level cache MSHRs. For high inter-core locality benchmarks this leads to an average 28 percent reduction in memory request latency and 11 percent improvement in performance.", acknowledgement = ack-nhfb, affiliation = "Li, DD (Reprint Author), Univ British Columbia, Dept Elect \& Comp Engn, Vancouver, BC V6T 1Z4, Canada. Li, Dongdong; Aamodt, Tor M., Univ British Columbia, Dept Elect \& Comp Engn, Vancouver, BC V6T 1Z4, Canada.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "dongdong@ece.ubc.ca aamodt@ece.ubc.ca", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Benchmark testing; cache line; cache storage; Computational fluid dynamics; DRAM chips; dynamic random-access memory; GPGPU; GPU DRAM scheduler; graphics processing units; Graphics processing units; graphics processing units; Graphics processing units; graphics processing units; Instruction sets; intercore locality aware memory scheduling; last level cache MSHR entry; locality; Locality; locality; Locality; locality; memory access scheduling; Memory Access Scheduling; memory access scheduling; Memory Access Scheduling; memory level parallelism; memory request; memory request latency; miss-status holding register; MLP; multiple cores; multiple in-flight miss requests; multiprocessing systems; parallel processing; parallel threads; Processor scheduling; processor scheduling; Processor scheduling; processor scheduling; Random access memory; read-only data", number-of-cited-references = "16", research-areas = "Computer Science", times-cited = "4", unique-id = "Li:2016:ICL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Pu:2016:NIP, author = "Libei Pu and Kshitij Doshi and Ellis Giles and Peter Varman", title = "Non-Intrusive Persistence with a Backend {NVM} Controller", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2443105", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "By providing instruction-grained access to vast amounts of persistent data with ordinary loads and stores, byte-addressable storage class memory (SCM) has the potential to revolutionize system architecture. 
We describe a non-intrusive SCM controller for achieving light-weight failure atomicity through back-end operations. Our solution avoids costly software intervention by decoupling isolation and concurrency-driven atomicity from failure atomicity and durability, and does not require changes to the front-end cache hierarchy. Two implementation alternatives --- one using a hardware structure, and the other extending the memory controller with a firmware managed volatile space, are described.", acknowledgement = ack-nhfb, affiliation = "Pu, LB (Reprint Author), Rice Univ, ECE, Houston, TX 77005 USA. Pu, Libei; Giles, Ellis; Varman, Peter, Rice Univ, ECE, Houston, TX 77005 USA. Doshi, Kshitij, Intel, SSG, Phoenix, AZ 85226 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "pulibei@gmail.com kshitij.a.doshi@intel.com erg@rice.edu pjv@rice.edu", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation (NSF) [CCF 1439075]; Intel Software and Services Group", funding-text = "This paper is supported by the US National Science Foundation (NSF) Grant CCF 1439075 and by Intel Software and Services Group.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "atomicity; backend NVM controller; byte-addressable storage class memory; cache storage; concurrency-driven atomicity; consistency; durability; failure analysis; firmware; firmware managed volatile space; front-end cache hierarchy; Hardware; hardware structure; instruction-grained access; isolation decoupling; light-weight failure atomicity; memory architecture; Memory management; Non-volatile memory; nonintrusive persistence; nonintrusive SCM controller; Nonvolatile memory; persistent memory; Process control; Random access memory; random-access storage; Retirement; Software; software intervention; system architecture", keywords-plus = "SYSTEM", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "1", unique-id = "Pu:2016:NIP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Garcia:2016:CMP, author = "P. Garcia and T. Gomes and J. Monteiro and A. Tavares and M. Ekpanyapong", title = "On-Chip Message Passing Sub-System for Embedded Inter-Domain Communication", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "33--36", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2419260", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "This letter describes the architecture of an inter-domain message passing hardware sub-system targeting the embedded virtualization field. 
Embedded virtualization is characterized by application-specific solutions, where functionality is partitioned into a small, fixed number of Virtual Machines, typically under real-time constraints, which must communicate for synchronization and status signaling. In light of the growing use of custom hardware, especially supported by (re)configurable platforms, we show how our hardware sub-system can provide virtualization-safe data transfers, without the need for Hypervisor (software) mediation, through the use of translate-once and virtual-interface hardware mechanisms, allowing direct memory-to-memory copies between different partitions' input/output buffers, in both direct-transfer and publish-subscribe modes. Our experiments show our architecture is especially suited for the real time domain, outperforming an equivalent software solution in latencies, throughput and jitter, and outperforming state of the art hardware solutions for small message sizes ($ < 512 $ B).", acknowledgement = ack-nhfb, affiliation = "Garcia, P (Reprint Author), Univ Minho, Dept Ctr Algoritmi, P-4800 Braga, Portugal. Garcia, P.; Gomes, T.; Monteiro, J.; Tavares, A., Univ Minho, Dept Ctr Algoritmi, P-4800 Braga, Portugal. Ekpanyapong, M., Asian Inst Technol, Dept Microelect \& Embedded Syst, Khlong Luang, Thailand.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "pgarcia@dei.uminho.pt tgomes@dei.uminho.pt jmonteiro@dei.uminho.pt atavares@dei.uminho.pt mongkol@ait.ac.th", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "FCT [SFRH/BD/77813/2011]", funding-text = "This work was supported in part by a grant from FCT, reference SFRH/BD/77813/2011. P. Garcia is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application program interfaces; application-specific solutions; configurable platforms; direct memory-to-memory copies; direct-transfer modes; embedded interdomain communication; embedded systems; embedded virtualization field; Hardware; interdomain message passing hardware subsystem; message passing; Message passing; on-chip message passing subsystem; partition input/output buffers; publish subscribe modes; Publish-subscribe; real time domain; real-time constraints; Software; status signaling; synchronisation; synchronization; Throughput; translate-once mechanism; Virtual machine monitors; virtual machines; virtual-interface hardware mechanisms; virtualisation; Virtualization; virtualization-safe data transfers", number-of-cited-references = "15", ORCID-numbers = "Monteiro, Joao L/0000-0002-3287-3995 Monteiro, Joao/0000-0002-3287-3995 Tavares, Adriano/0000-0001-8316-6927 Gomes, Tiago/0000-0002-8496-8179 Garcia, Paulo/0000-0002-1041-5205", research-areas = "Computer Science", researcherid-numbers = "Monteiro, Joao L/H-7751-2012 Monteiro, Joao/Q-6857-2019 Tavares, Adriano/M-5257-2013", times-cited = "1", unique-id = "Garcia:2016:CMP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Li:2016:PHP, author = "Minghua Li and Guancheng Chen and Qijun Wang and Yonghua Lin and Peter Hofstee and Per Stenstrom and Dian Zhou", title = "{PATer}: a Hardware Prefetching Automatic Tuner on {IBM} {POWER8} Processor", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "37--40", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2442972", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware prefetching on IBM's latest 
POWER8 processor is able to improve performance of many applications significantly, but it can also cause performance loss for others. The IBM POWER8 processor provides one of the most sophisticated hardware prefetching designs which supports $ 2^{25} $ different configurations. Obviously, it is a big challenge to find the optimal or near-optimal hardware prefetching configuration for a specific application. We present a dynamic prefetching tuning scheme in this paper, named prefetch automatic tuner (PATer). PATer uses a prediction model based on machine learning to dynamically tune the prefetch configuration based on the values of hardware performance monitoring counters (PMCs). By developing a two-phase prefetching selection algorithm and a prediction accuracy optimization algorithm in this tool, we identify a set of selected key hardware prefetch configurations that matter mostly to performance as well as a set of PMCs that maximize the machine learning prediction accuracy. We show that PATer is able to accelerate the execution of diverse workloads up to $ 1.4 \times $.", acknowledgement = ack-nhfb, affiliation = "Li, MH (Reprint Author), Univ Texas Dallas, Dept Elect Engn, Richardson, TX 75080 USA. Li, MH (Reprint Author), IBM Res China, Beijing, Peoples R China. Li, Minghua; Zhou, Dian, Univ Texas Dallas, Dept Elect Engn, Richardson, TX 75080 USA. Li, Minghua; Chen, Guancheng; Wang, Qijun; Lin, Yonghua, IBM Res China, Beijing, Peoples R China. Hofstee, Peter, IBM Corp, ARL, Austin, TX USA. Stenstrom, Per, Chalmers, Dept Sci \& Comp Engn, Gothenburg, Sweden.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "mxl095420@utdallas.edu chengc@cn.ibm.com wqijun@cn.ibm.com linyh@cn.ibm.com hofstee@us.ibm.com pers@chalmers.se zhoud.utdallas@gmail.com", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "IBM Research global summer intern program", funding-text = "The authors would like to thank the anonymous reviewers for their valuable suggestions and comments to improve the paper. The authors also want to thank Ling Shao, Xiaowei Shen, Qi Guo, Kun Wang, Tao Liu, Yan Li from IBM Research, and Sally A. Mckee from Chalmers for their insightful suggestions. Minghua Li was supported by IBM Research global summer intern program.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accuracy; Benchmark testing; Classifier design and evaluation; Classifier design and evaluation, machine learning, optimization, performance measures; Hardware; hardware PMC; hardware prefetching automatic tuner; IBM POWER8 processor; learning (artificial intelligence); machine learning; multiprocessing systems; Optimization; optimization; Optimization; PATer; performance evaluation; performance measures; performance monitoring counters; prediction accuracy optimization algorithm; prefetch automatic tuner; Prefetching; Runtime; storage management; Training; two-phase prefetching selection algorithm", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "1", unique-id = "Li:2016:PHP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Alian:2016:PGS, author = "Mohammad Alian and Daehoon Kim and Nam Sung Kim", title = "{pd-gem5}: Simulation Infrastructure for Parallel\slash Distributed Computer Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "41--44", month = jan # "\slash " # jun, year = "2016", CODEN = 
"????", DOI = "https://doi.org/10.1109/LCA.2015.2438295", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Improving the performance and power efficiency of a single processor has been fraught with various challenges stemming from the end of the classical technology scaling. Thus, the importance of efficiently running applications on a parallel/distributed computer system has continued to increase. In developing and optimizing such a parallel/distributed computer system, it is critical to study the impact of the complex interplay amongst processor, node, and network architectures on performance and power efficiency in detail. This necessitates a flexible, detailed and open-source full-system simulation infrastructure. However, our community lacks such an infrastructure. In this paper, we present pd-gem5, a gem5-based infrastructure that can model and simulate a parallel/ distributed computer system using multiple simulation hosts. Our experiment shows that pd-gem5 running on six simulation hosts speeds up the simulation of a 24-node computer system up to $ 3.2 \times $ compared with running on a single simulation host.", acknowledgement = ack-nhfb, affiliation = "Kim, NS (Reprint Author), Univ Illinois, ECE Dept, Urbana, IL 61801 USA. Alian, Mohammad; Kim, Daehoon; Kim, Nam Sung, Univ Illinois, ECE Dept, Urbana, IL 61801 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "nskim@illinois.edu", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CNS-1217102, CNS-1512981]; DARPA [HR0011-12-2-0019]", funding-text = "This work was supported in part by NSF (CNS-1217102 and CNS-1512981) and DARPA (HR0011-12-2-0019) grants. Nam Sung Kim has a financial interest in Samsung Electronics and AMD. 
Daehoon Kim is the corresponding author.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Computational modeling; digital simulation; gem5; Handheld computers; Load modeling; multiple simulation hosts; network; open-source full-system simulation infrastructure; parallel processing; parallel/distributed computer systems; parallel/distributed simulation; pd-gem5; power aware computing; public domain software; single processor performance; single processor power efficiency; single simulation host; Switches; Synchronization; technology scaling", number-of-cited-references = "6", research-areas = "Computer Science", times-cited = "4", unique-id = "Alian:2016:PGS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2016:RFE, author = "Yoongu Kim and Weikun Yang and Onur Mutlu", title = "{Ramulator}: a Fast and Extensible {DRAM} Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "45--49", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2414456", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Recently, both industry and academia have proposed many different roadmaps for the future of DRAM. Consequently, there is a growing need for an extensible DRAM simulator, which can be easily modified to judge the merits of today's DRAM standards as well as those of tomorrow. In this paper, we present Ramulator, a fast and cycle-accurate DRAM simulator that is built from the ground up for extensibility. Unlike existing simulators, Ramulator is based on a generalized template for modeling a DRAM system, which is only later infused with the specific details of a DRAM standard. 
Thanks to such a decoupled and modular design, Ramulator is able to provide out-of-the-box support for a wide array of DRAM standards: DDR3/4, LPDDR3/4, GDDR5, WIO1/2, HBM, as well as some academic proposals (SALP, AL-DRAM, TL-DRAM, RowClone, and SARP). Importantly, Ramulator does not sacrifice simulation speed to gain extensibility: according to our evaluations, Ramulator is $ 2.5 \times $ faster than the next fastest simulator. Ramulator is released under the permissive BSD license.", acknowledgement = ack-nhfb, affiliation = "Kim, Y (Reprint Author), Carnegie Mellon Univ, Dept Elect \& Comp Engn, Pittsburgh, PA 15213 USA. Kim, Yoongu; Mutlu, Onur, Carnegie Mellon Univ, Dept Elect \& Comp Engn, Pittsburgh, PA 15213 USA. Yang, Weikun, Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Yang, Weikun, Peking Univ, Dept Comp Sci, Beijing, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yoongu.kim@gmail.com wkyjyy@gmail.com omutlu@gmail.com", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF; SRC", funding-text = "We thank the SAFARI group members who have contributed to the development of Ramulator, including Kevin Chang, Saugata Ghose, Donghyuk Lee, Tianshi Li, and Vivek Seshadri. We also thank the anonymous reviewers for feedback. This work was supported by NSF, SRC, and gifts from our industrial partners, including Google, Intel, Microsoft, Nvidia, Samsung, Seagate and VMware. Ramulator can be freely downloaded from https://github.com/CMU-SAFARI/ramulator", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "circuit simulation; digital simulation; DRAM; DRAM chips; DRAM simulator; DRAM standard; emerging technologies; experimental methods; Hardware design languages; Main memory; memory scaling; memory systems; Nonvolatile memory; performance evaluation; performance evaluation, experimental methods, emerging technologies, memory systems, memory scaling; Proposals; Ramulator; Random access memory; Runtime; simulation; software tool; standards; Standards; standards; Timing", keywords-plus = "LATENCY DRAM; RETHINKING", number-of-cited-references = "38", research-areas = "Computer Science", times-cited = "29", unique-id = "Kim:2016:RFE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Olson:2016:SIT, author = "Lena E. Olson and Simha Sethumadhavan and Mark D. Hill", title = "Security Implications of Third-Party Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "50--53", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2445337", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Third-party accelerators offer system designers high performance and low energy without the market delay of in-house development. However, complex third-party accelerators may include vulnerabilities due to design flaws or malicious intent that are hard to expose during verification. Rather than react to each new vulnerability, it is better to proactively build defenses for classes of attacks. To inspire future work on defenses, this paper develops a taxonomy of accelerator vulnerabilities. 
We consider the cross product of threat types (confidentiality, integrity, and availability) with risk categories (configuration, computation, termination, accelerator memory accesses, system memory accesses, microarchitecture/coherence, exceptions/interrupts, and power), as well as whether processes can be vulnerable only if they use the offending accelerator (accelerator-scope threat) or even when running in the same system (system-scope threat). Our taxonomy draws attention to a grave problem that needs immediate attention from computer architects.", acknowledgement = ack-nhfb, affiliation = "Olson, LE (Reprint Author), Univ Wisconsin, Dept Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA. Olson, Lena E.; Hill, Mark D., Univ Wisconsin, Dept Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA. Sethumadhavan, Simha, Columbia Univ, Dept Comp Sci, New York, NY 10026 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "lena@cs.wisc.edu simha@cs.columbia.edu markhill@cs.wisc.edu", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [1054844]; Alfred P. Sloan Foundation; [FA8750-10-2-0253]; [FA8650-11-C-7190]", funding-text = "This work is supported through grants FA8750-10-2-0253, FA8650-11-C-7190, NSF 1054844 and the Alfred P. Sloan Foundation. Opinions, findings, conclusions and recommendations expressed in this material are those of the authors and may not reflect the views of the funding entities. The authors thank Eric Sedlar, Dan Gibson, Multifacet, and UW-Madison Computer Architecture Affiliates for valuable feedback.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator architectures; accelerator vulnerabilities; accelerator-scope threat; Coherence; computer architecture; Computer bugs; Computer security; Cryptography; Hardware; malicious intent; market delay; Registers; risk categories; risk management; system-scope threat; Taxonomy; third-party accelerators", number-of-cited-references = "20", oa = "Bronze", research-areas = "Computer Science", times-cited = "5", unique-id = "Olson:2016:SIT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jacob:2016:CVC, author = "Bruce Jacob", title = "The Case for {VLIW--CMP} as a Building Block for Exascale", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "54--57", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2424699", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Current ultra-high-performance computers execute instructions at the rate of roughly 10 PFLOPS (10 quadrillion floating-point operations per second) and dissipate power in the range of 10 MW. The next generation will need to execute instructions at EFLOPS rates --- 100x as fast as today's --- but without dissipating any more power. To achieve this challenging goal, the emphasis is on power-efficient execution, and for this we propose VLIW-CMP as a general architectural approach that improves significantly on the power efficiency of existing solutions. Compared to manycore architectures using simple, single-issue cores, VLIW-CMP reduces both power and die area, improves single-thread performance, and maintains aggregate FLOPS per die. 
To improve further on the power advantages of VLIW, we describe a mechanism that reduces power dissipation of both data forwarding and register-file activity.", acknowledgement = ack-nhfb, affiliation = "Jacob, B (Reprint Author), Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD 20742 USA. Jacob, Bruce, Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD 20742 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "blj@ece.umd.edu", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Computer architectures; Computer architectures, high-performance computing, energy efficiency, multicore; data forwarding activity; EFLOPS rates; energy efficiency; high-performance computing; manycore architectures; multicore; multiprocessing systems; parallel architectures; performance evaluation; PFLOPS; Pipelines; Ports (Computers); power aware computing; power dissipation; power-efficient execution; quadrillion floating-point operations-per-second; Radio frequency; register-file activity; Registers; single-thread performance improvement; Software; ultra-high-performance computers; VLIW; VLIW-CMP", keywords-plus = "REGISTER LIFETIME; ARCHITECTURE", number-of-cited-references = "18", research-areas = "Computer Science", times-cited = "1", unique-id = "Jacob:2016:CVC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kleanthous:2016:TML, author = "Marios Kleanthous and Yiannakis Sazeides and Emre Ozer and Chrysostomos Nicopoulos and Panagiota Nikolaou and Zacharias Hadjilambrou", title = "Toward Multi-Layer Holistic Evaluation of System Designs", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "58--61", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = 
"https://doi.org/10.1109/LCA.2015.2445877", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The common practice for quantifying the benefit(s) of design-time architectural choices of server processors is often limited to the chip- or server-level. This quantification process invariably entails the use of salient metrics, such as performance, power, and reliability, which capture-in a tangible manner-a designs overall ramifications. This paper argues for the necessity of a more holistic evaluation approach, which considers metrics across multiple integration levels (chip, server and datacenter). In order to facilitate said comprehensive evaluation, we utilize an aggregate metric, e.g. the Total Cost of Ownership (TCO), to harness the complexly of comparing multiple metrics at multiple levels. We motivate our proposition for holistic evaluation with a case study that compares a 2D processor to a 3D processor at various design integration levels. We show that while a 2D processor is clearly the best choice at the processor level, the conclusion is reversed at the data-center level, where the 3D processor becomes a better choice. This result emanates mainly from the performance benefits of processor-DRAM 3D integration, and the ability to amortize (at the datacenter-level) the higher 3D per-server cost and lower reliability by requiring fewer 3D servers to match the same performance.", acknowledgement = ack-nhfb, affiliation = "Kleanthous, M (Reprint Author), Univ Cyprus, Dept Comp Sci, Nicosia, Cyprus. Kleanthous, Marios; Sazeides, Yiannakis; Nikolaou, Panagiota; Hadjilambrou, Zacharias, Univ Cyprus, Dept Comp Sci, Nicosia, Cyprus. Nicopoulos, Chrysostomos, Univ Cyprus, Dept Elect \& Comp Engn, Nicosia, Cyprus. Ozer, Emre, ARM Ltd, Res, Cambridge CB19NJ, England.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "marios@kleanthous.info yanos@cs.ucy.ac.cy emre.ozer@arm.com nicopoulos@ucy.ac.cy nikolaou@cs.ucy.ac.cy zhadji01@cs.ucy.ac.cy", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "European Commission [612069 HARPA, 247779 EuroCloud]", funding-text = "This work was supported by the European Commission FP7 projects ``Harnessing Performance Variability'' (No: 612069 HARPA) and ``Energy-conscious 3D Server-on-Chip for Green Cloud Services'' (No: 247779 EuroCloud).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "2D processor; 3D processor; Chip; chip; Chip; chip; Computational modeling; computer centres; data-center level; Datacenter; datacenter; Datacenter; datacenter; design integration levels; Design-Space Exploration; design-space exploration; Design-Space Exploration; design-space exploration; design-time architectural choices; DRAM chips; Evaluation Metrics; evaluation metrics; Evaluation Metrics; Holistic evaluation; Holistic Evaluation; Holistic evaluation; Holistic Evaluation; Holistic evaluation; integrated circuit reliability; Measurement; microprocessor chips; multilayer holistic evaluation; multiple integration levels; performance evaluation; processor-DRAM 3D integration; Program processors; ramifications; Reliability; reliability; Reliability; Server; server; Server; server processors; Servers; system designs; System-on-chip; Three-dimensional displays", keywords-plus = "PERFORMANCE", number-of-cited-references = "23", ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068", research-areas = "Computer Science", times-cited = "1", unique-id = "Kleanthous:2016:TML", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Daya:2016:THP, author = "Bhavya K. Daya and Li-Shiuan Peh and Anantha P. 
Chandrakasan", title = "Towards High-Performance Bufferless {NoCs} with {SCEPTER}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "62--65", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2428699", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In the many-core era, the network on-chip (NoC) is playing a larger role in meeting performance, area and power goals, as router buffers contribute greatly to NoC area and power usage. Proposals have advocated bufferless NoCs, however a performance wall has been reached such that high throughput performance has not been extracted. We present SCEPTER, a high-performance bufferless mesh NoC that sets up single-cycle virtual express paths dynamically across the chip, allowing deflected packets to go through non-minimal paths with no latency penalty. For a 64 node network, we demonstrate an average 62 percent reduction in latency and an average $ 1.3 \times $ higher throughput over a baseline bufferless NoC for synthetic traffic patterns; with comparable performance to a single-cycle multihop buffered mesh network with six flit buffers, per input port, in each router.", acknowledgement = ack-nhfb, affiliation = "Daya, BK (Reprint Author), MIT, Dept EECS, 77 Massachusetts Ave, Cambridge, MA 02139 USA. Daya, Bhavya K.; Peh, Li-Shiuan; Chandrakasan, Anantha P., MIT, Dept EECS, 77 Massachusetts Ave, Cambridge, MA 02139 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "bdaya@mit.edu peh@csail.mit.edu anantha@mtl.mit.edu", da = "2019-06-20", doc-delivery-number = "DY1XQ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "64 node network; bufferless router; bypassing; Computer architecture; deflection routing; high-performance bufferless mesh NoC; latency reduction; multiprocessor interconnection; Multiprocessor interconnection; multiprocessor interconnection; multiprocessor interconnection networks; Multiprocessor interconnection, on-chip mesh networks, bufferless router, deflection routing, bypassing; network routing; network-on-chip; nonminimal paths; on-chip mesh networks; performance evaluation; Pipelines; Ports (Computers); power aware computing; power usage; Resource management; router buffers; Routing; SCEPTER; single-cycle express path traversal for efficient routing; single-cycle virtual express paths; Switches; synthetic traffic patterns; Throughput", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "4", unique-id = "Daya:2016:THP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2016:IICc, author = "Anonymous", title = "Introducing {IEEE Collabratec}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "66--66", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578800", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:IICd, author = "Anonymous", title = "Introducing {IEEE Collabratec}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "66--66", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578800", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 08:36:31 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "IEEE Collabratec is a new, integrated online community where IEEE members, researchers, authors, and technology professionals with similar fields of interest can network and collaborate, as well as create and manage content. Featuring a suite of powerful online networking and collaboration tools, IEEE Collabratec allows you to connect according to geographic location, technical interests, or career pursuits. You can also create and share a professional identity that showcases key accomplishments and participate in groups focused around mutual interests, actively learning from and contributing to knowledgeable communities. All in one place! Learn about IEEE Collabratec at ieeecollabratec.org.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:ENM, author = "Anonymous", title = "Experience the Newest and Most Advanced Thinking in Big Data Analytics", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "67--67", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2581058", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Advertisement, IEEE.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:ICS, author = "Anonymous", title = "{{\booktitle{IEEE Cyber Security}}}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "68--68", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2581078", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Advertisement, IEEE.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:TCa, author = "Anonymous", title = "Table of Contents", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "C1--C1", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578758", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Presents the table of contents for this issue of the publication.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:Ca, author = "Anonymous", title = "Cover", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "C2--C2", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578759", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:Cb, author = "Anonymous", title = "Cover", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "C2--C2", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578759", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 08:36:31 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Provides a listing of board members, committee members, editors, and society officers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:Cc, author = "Anonymous", title = "Cover", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "C3--C3", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578760", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:Cd, author = "Anonymous", title = "Cover", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "C3--C3", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578760", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 08:36:31 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "These instructions give guidelines for preparing papers for this publication. Presents information for authors publishing in this journal.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:BC, author = "Anonymous", title = "[{Back} cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "1", pages = "C4--C4", month = jan # "\slash " # jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2578761", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Presents the table of contents for this issue of the publication.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Liang:2016:CGR, author = "Shuang Liang and Shouyi Yin and Leibo Liu and Yike Guo and Shaojun Wei", title = "A Coarse-Grained Reconfigurable Architecture for Compute-Intensive {MapReduce} Acceleration", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "69--72", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2458318", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Large-scale workloads often show parallelism of different levels. which offers acceleration potential for clusters and parallel processors. Although processors such as GPGPUs and FPGAs show good performance of speedup, there is still vacancy for a low power, high efficiency and dynamically reconfigurable one, and coarse-grained reconfigurable architecture (CGRA) seems to be one possible choice. In this paper, we introduce how we use our CGRA fabric Chameleon to realize a dynamically reconfigurable acceleration to MapReduce-based (MR-based) applications. A FPGA-shell-CGRA-core (FSCC) architecture is designed for the acceleration PCI-Express board, and a programming model with compilation flow for CGRA is presented. With the supports above, a small evaluation cluster with Hadoop framework is set up, and experiments on compute-intensive applications show that the programming process is significantly simplified, with an 30-60 x speedup offered under low power.", acknowledgement = ack-nhfb, affiliation = "Yin, SY (Reprint Author), Tsinghua Univ, Inst Microelect, Beijing 100084, Peoples R China. Liang, Shuang; Yin, Shouyi; Liu, Leibo; Wei, Shaojun, Tsinghua Univ, Inst Microelect, Beijing 100084, Peoples R China. 
Guo, Yike, Imperial Coll London, Dept Comp, London, England.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "s-liang11@mails.tsinghua.edu.cn yinsy@tsinghua.edu.cn liulb@mail.tsinghua.edu.cn fiascoo@gmail.com wsj@tsinghua.edu.cn", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Nature Science foundation of China [61274131]; International S\&T Cooperation Project of China [2012DFA11170]; Tsinghua Indigenous Research Project [20111080997]; China National High Technologies Research Program [2012-AA012701]", funding-text = "This work was supported by the National Nature Science foundation of China (No. 61274131), the International S\&T Cooperation Project of China (No. 2012DFA11170), the Tsinghua Indigenous Research Project (No. 20111080997) and the China National High Technologies Research Program (No. 2012-AA012701). S. Yin is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator; Accelerators; Computer architecture; Field programmable gate arrays; Hardware; MapReduce; Programming; Reconfigurable architectures; Reconfigurable computing; Servers", number-of-cited-references = "15", research-areas = "Computer Science", times-cited = "1", unique-id = "Liang:2016:CGR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lai:2016:QMD, author = "Bo-Cheng Charles Lai and Luis Garrido Platero and Hsien-Kai Kuo", title = "A Quantitative Method to Data Reuse Patterns of {SIMT} Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "73--76", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2491279", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Understanding data reuse patterns of a computing system is crucial to effective design optimization. The emerging Single Instruction Multiple Threads (SIMT) processor adopts a programming model that is fundamentally disparate from conventional scalar processors. There is a lack of analytical approaches to quantify the data reuse of SIMT applications. This paper presents a quantitative method to study the data reuse inherent to SIMT applications. A metric, Data Reuse Degree, is defined to measure the amount of reused data between memory references, and associate each data reuse degree to a temporal distance representing the virtual time of the execution process. The experiments are performed on an abstracted SIMT processor that considers the programming model and runtime specifics. 
The experiments illustrate diverse data reuse patterns of SIMT applications and explore the impacts of architectural limitations.", acknowledgement = ack-nhfb, affiliation = "Lai, BCC (Reprint Author), Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300, Taiwan. Lai, Bo-Cheng Charles, Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300, Taiwan. Platero, Luis Garrido, Barcelona Super Comp Ctr, Barcelona, Spain. Kuo, Hsien-Kai, MediaTek Inc, Hsinchu, Taiwan.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "bclai@mail.nctu.edu.tw luis.garrido.platero@gmail.com hsienkai.kuo@gmail.com", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "MOST [104-2221-E-009-079]", funding-text = "This project was supported by MOST grant 104-2221-E-009-079.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architectural limitations; cache memory; Cache memory; computing system; data analysis; data reuse degree; data reuse patterns; design optimization; execution process; Graphics processing units; Instruction sets; Measurement; Memory management; multi-threading; Parallel architectures; Parallel architectures, cache memory, parallel processing; parallel processing; Parallel processing; programming model; scalar processors; SIMT applications; SIMT processors; single-instruction multiple-threads processors; virtual time", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Lai:2016:QMD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Cakmakci:2016:CPG, author = "Yaman {\c{C}}akmak{\c{c}}i and Will Toms and Javier Navaridas and Mikel Lujan", title = "Cyclic Power-Gating as an Alternative to Voltage and Frequency Scaling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "77--80", month = 
jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2478784", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Dynamic Voltage and Frequency Scaling is the most commonly used power management technique in modern processors. However, the ability of an individual chip to operate under reduced supply voltage can no longer be predetermined at the design stage and may even change over time. This paper presents Cyclic Power-Gating (CPG), a novel power management strategy where the power consumption of a core can be finely controlled without scaling the supply voltage. CPG builds on state-retentive power-gating which allows the power supply to a core to be switched off and on again at high speed (tens of clock cycles) with minimal disruption to running programs. The power-gating is cyclic, by altering the ratio of time spent powered-on and off in each power-gating period the effective operating frequency and power consumption of a core can be controlled. The overheads in delay and power consumption of CPG for an out-of-order core in a 14 nm technology are accurately modelled and compared to the performance and power consumption of Voltage/Frequency pairs in the same technology. The proposed power gating method reduces average power consumption by 4 percent over voltage and frequency scaling with only a 2 percent degradation in performance.", acknowledgement = ack-nhfb, affiliation = "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), Univ Manchester, Sch Comp Sci, Manchester M13 9PL, Lancs, England. {\c{C}}akmak{\c{c}}i, Yaman; Toms, Will; Navaridas, Javier; Lujan, Mikel, Univ Manchester, Sch Comp Sci, Manchester M13 9PL, Lancs, England.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "cakmakcy@cs.man.ac.uk tomsw@cs.man.ac.uk javier.navaridas@manchester.ac.uk mikel.lujan@manchester.ac.uk", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "EPSRC [DOME EP/J016330/1, PAMELA EP/K008730/1]; Royal Society University Research Fellowship; Engineering and Physical Sciences Research Council [EP/K008730/1, EP/J016330/1]", funding-text = "This work was supported by EPSRC grants DOME EP/J016330/1 and PAMELA EP/K008730/1. Mike Lujan an is funded by a Royal Society University Research Fellowship. The authors thank Timothy Jones for his comments on the draft version of this paper.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Capacitance; Computer architecture; CPG; cyclic power-gating; Energy efficiency; frequency scaling; leakage reduction; power aware computing; power consumption; Power demand; Power efficient design; power management; power management strategy; state-retentive power-gating; Voltage measurement; voltage scaling", number-of-cited-references = "12", oa = "Bronze", ORCID-numbers = "Navaridas Palma, Javier/0000-0001-7272-6597", research-areas = "Computer Science", times-cited = "0", unique-id = "Cakmakci:2016:CPG", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tomusk:2016:DDG, author = "Erik Tomusk and Christophe Dubach and Michael O'Boyle", title = "Diversity: a Design Goal for Heterogeneous Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "81--84", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2499739", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "A growing number of processors have CPU cores that implement the same instruction set architecture (ISA) using different microarchitectures. The underlying motivation for single-ISA heterogeneity is that a diverse set of cores can enable runtime flexibility. Modern processors are subject to strict power budgets, and heterogeneity provides the runtime scheduler with more latitude to decide the level of performance a program should have based on the amount of power that can be spent. We argue that selecting a diverse set of heterogeneous cores to enable flexible operation at runtime is a non-trivial problem due to diversity in program behavior. We further show that common evaluation methods lead to false conclusions about diversity. Finally, we suggest the KS statistical test as an evaluation metric. The KS test is the first step toward a heterogeneous design methodology that optimizes for runtime flexibility.", acknowledgement = ack-nhfb, affiliation = "Tomusk, E (Reprint Author), Univ Edinburgh, Edinburgh, Midlothian, Scotland. Tomusk, Erik; Dubach, Christophe; O'Boyle, Michael, Univ Edinburgh, Edinburgh, Midlothian, Scotland.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "e.tomusk@ed.ac.uk christophe.dubach@ed.ac.uk mob@inf.ed.ac.uk", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Computer architecture; core selection; CPU cores; design goal; Diversity; flexibility; heterogeneity; heterogeneous cores; heterogeneous design methodology; heterogeneous processors; instruction set architecture; instruction sets; integrated circuit design; ISA; Kolmogorov-Smirnov test; KS statistical test; Measurement; metrics; Microarchitecture; microarchitectures; microprocessor chips; power aware computing; Program processors; Runtime; runtime flexibility; runtime scheduler; statistical testing", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "2", unique-id = "Tomusk:2016:DDG", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hashemi:2016:EEB, author = "Milad Hashemi and Debbie Marr and Doug Carmean and Yale N. Patt", title = "Efficient Execution of Bursty Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "85--88", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2456013", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The performance of user-facing applications is critical to client platforms. Many of these applications are event-driven and exhibit ``bursty'' behavior: the application is generally idle but generates bursts of activity in response to human interaction. We study one example of a bursty application, web-browsers, and produce two important insights: (1) Activity bursts contain false parallelism, bringing many cores out of a deep sleep to inefficiently render a single webpage, and (2) these bursts are highly compute driven, and thus scale nearly linearly with frequency. 
We show average performance gains/energy reductions of 14\%/17\% respectively on real hardware by statically moving threads from multiple cores to a single core. We then propose dynamic hardware driven thread migration and scheduling enhancements that detect these bursts, leading to further benefits.", acknowledgement = ack-nhfb, affiliation = "Hashemi, M (Reprint Author), Univ Texas Austin, Elect \& Comp Engn, Austin, TX 78701 USA. Hashemi, Milad; Patt, Yale N., Univ Texas Austin, Elect \& Comp Engn, Austin, TX 78701 USA. Marr, Debbie, Intel Corp, Intel Labs, Portland, OR USA. Carmean, Doug, Microsoft, Microsoft Res, Seattle, WA USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "miladh@hps.utexas.edu debbie.marr@intel.com dcarmean@microsoft.com patt@hps.utexas.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Intel Corporation; Cockrell Foundation; HPS Research Group", funding-text = "The authors thank Intel Corporation and the Cockrell Foundation for their continued generous financial support of the HPS Research Group.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Browsers; bursty applications; dynamic hardware; Energy; energy reductions; Hardware; human computer interaction; human interaction; Instruction sets; Internet; Loading; multi-threading; Multicore processing; multiple cores; multiprocessing systems; online front-ends; Operating systems; performance; performance evaluation; performance gains; power aware computing; thread migration; thread scheduling; Web-browsers; Webpage; webpages; webpages, thread scheduling", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "0", unique-id = "Hashemi:2016:EEB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kannan:2016:EAP, author = "Sudarsun Kannan and Moinuddin Qureshi and Ada Gavrilovska and Karsten Schwan", title = "Energy Aware Persistence: Reducing the Energy Overheads of Persistent Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "89--92", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2472410", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Next generation byte addressable nonvolatile memory (NVM) technologies like PCM are attractive for end-user devices as they offer memory scalability as well as fast persistent storage. In such environments, NVM's limitations of slow writes and high write energy are magnified for applications that need atomic, consistent, isolated and durable (ACID) updates. This is because, for satisfying correctness (ACI), application state must be frequently flushed from all intermediate buffers, including processor cache, and to support durability (D) guarantees, that state must be logged.
This increases NVM access and more importantly results in additional CPU instructions. This paper proposes Energy Aware Persistence (EAP). To develop EAP, we first show that the energy related overheads for maintaining durability are significant. We then propose energy-efficient durability principles that mitigate those costs, an example being flexible logging that switch between performance and energy-efficient modes and a memory management technique that trades capacity for energy. Finally, we propose relaxed durability (ACI-RD) mechanism used under critical low energy conditions that do not affect correctness. The initial results for several realistic applications and benchmark show up to 2x reduction in CPU and NVM energy usage relative to a traditional ACID-based persistence.", acknowledgement = ack-nhfb, affiliation = "Kannan, S (Reprint Author), Georgia Inst Technol, Atlanta, GA 30332 USA. Kannan, Sudarsun; Qureshi, Moinuddin; Gavrilovska, Ada; Schwan, Karsten, Georgia Inst Technol, Atlanta, GA 30332 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sudarsun@gatech.edu moin@ece.gatech.edu ada@cc.gatech.edu schwan@cc.gatech.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ACI-RD mechanism; ACID; ACID updates; ACID-based persistence; atomic-consistent-isolated-durable updates; Benchmark testing; cache storage; CPU energy usage; CPU instructions; EAP; end-user devices; energy aware persistence; Energy management; energy overhead reduction; energy overheads; energy-efficient durability principles; energy-efficient modes; heap-based persistence; logging; memory management; microprocessor chips; next generation byte addressable nonvolatile memory; next generation byte addressable NVM; Nonvolatile memory; NVM; NVM access; NVM energy usage; Optimization; performance evaluation; persistent memory; power aware computing; processor cache; Random access memory; random-access storage; Resource management; storage management", keywords-plus = "PHASE-CHANGE MEMORY", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Kannan:2016:EAP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Valero:2016:ELD, author = "Alejandro Valero and Negar Miralaei and Salvador Petit and Julio Sahuquillo and Timothy M. Jones", title = "Enhancing the {L1} Data Cache Design to Mitigate {HCI}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "93--96", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2460736", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Over the lifetime of a microprocessor, the Hot Carrier Injection (HCI) phenomenon degrades the threshold voltage, which causes slower transistor switching and eventually results in timing violations and faulty operation. 
This effect appears when the memory cell contents flip from logic `0' to `1' and vice versa. In caches, the majority of cell flips are concentrated into only a few of the total memory cells that make up each data word. In addition, other researchers have noted that zero is the most commonly-stored data value in a cache, and have taken advantage of this behavior to propose data compression and power reduction techniques. Contrary to these works, we use this information to extend the lifetime of the caches by introducing two microarchitectural techniques that spread and reduce the number of flips across the first-level (L1) data cache cells. Experimental results show that, compared to the conventional approach, the proposed mechanisms reduce the highest cell flip peak up to 65.8 percent, whereas the threshold voltage degradation savings range from 32.0 to 79.9 percent depending on the application.", acknowledgement = ack-nhfb, affiliation = "Valero, A (Reprint Author), Univ Politecn Valencia, Dept Comp Engn, Valencia, Spain. Valero, Alejandro; Petit, Salvador; Sahuquillo, Julio, Univ Politecn Valencia, Dept Comp Engn, Valencia, Spain. Miralaei, Negar; Jones, Timothy M., Univ Cambridge, Comp Lab, Cambridge, England.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "alvabre@gap.upv.es negar.miralaei@cl.cam.ac.uk spetit@disca.upv.es jsahuqui@disca.upv.es timothy.jones@cl.cam.ac.uk", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish Ministerio de Economia y Competitividad (MINECO); FEDER funds [TIN2012-38341-C04-01]; Intel Early Career Faculty Honor Program Award; HiPEAC Collaboration Grant-FP7 HiPEAC Network of Excellence [287759]; Engineering and Physical Sciences Research Council (EPSRC) [EP/K026399/1, EP/J016284/1]; Engineering and Physical Sciences Research Council [EP/J016284/1, EP/K026399/1]", funding-text = "This work has been supported by the Spanish Ministerio de Economia y Competitividad (MINECO), by FEDER funds through Grant TIN2012-38341-C04-01, by the Intel Early Career Faculty Honor Program Award, by a HiPEAC Collaboration Grant funded by the FP7 HiPEAC Network of Excellence under grant agreement 287759, and by the Engineering and Physical Sciences Research Council (EPSRC) through Grants EP/K026399/1 and EP/J016284/1. Additional data related to this publication are available in the data repository at https://www.repository.cam.ac.uk/handle/1810/249006.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Cache memories; Cache memory; cache storage; cell flip peaks; cell flips; commonly-stored data value; data compression; Degradation; faulty operation; first-level data cache cells; HCI mitigation; Hot carrier effects; Hot Carrier Injection; hot carrier injection; Hot Carrier Injection; hot carriers; Human computer interaction; L1 data cache design; memory architecture; memory cells; microarchitectural techniques; microprocessor chips; microprocessor lifetime; Microprocessors; power aware computing; power reduction; Program processors; threshold voltage degradation; transistor switching; Voltage measurement", number-of-cited-references = "10", oa = "Green Accepted, Green Published", ORCID-numbers = "Valero, Alejandro/0000-0002-0824-5833", research-areas = "Computer Science", times-cited = "0", unique-id = "Valero:2016:ELD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Sen:2016:GFM, author = "Rathijit Sen and David A. Wood", title = "{GPGPU} Footprint Models to Estimate per-Core Power", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "97--100", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2456909", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We explore the problem of how to easily estimate the per-core power distribution of GPGPUs from the total power of all cores. We show that the dynamic energy consumption of a core for a given kernel, represented by its work footprint, is approximately proportional to the total time taken by all work units executing on that core, and the static power, represented by its core footprint, is proportional to the time that the core has assigned work. 
Footprints can be easily tracked using two hardware counters per GPU core. We also show how per-core power estimates can be used to compute power-performance pareto frontiers that identify opportunities for saving power and energy in cases of non-uniform work distribution by exploiting per-core DVFS support for GPGPUs.", acknowledgement = ack-nhfb, affiliation = "Sen, R (Reprint Author), Univ Wisconsin, Dept Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA. Sen, Rathijit; Wood, David A., Univ Wisconsin, Dept Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "rathijit@cs.wisc.edu david@cs.wisc.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation (NSF) [CCF-1218323, CNS-1302260]", funding-text = "The authors thank Srilatha Manne, Indrani Paul, and Wei Huang for discussions about per-core DVFS support in GPUs and Mark Hill, Jason Power, anonymous reviewers, and the Associate Editor for helpful review comments. This work was supported in part with US National Science Foundation (NSF) grants CCF-1218323 and CNS-1302260. The views expressed herein are not necessarily those of the NSF. Wood has significant financial interests in AMD and Google.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; Computational modeling; DVFS; dynamic energy consumption; energy consumption; footprint; GPGPU; GPGPU footprint models; GPGPU per-core power distribution; Graphics processing units; graphics processing units; Mathematical model; Pareto analysis; pareto frontier; Pareto optimization; per-core DVFS support; per-core power estimation; power; power aware computing; Power distribution; power-performance Pareto frontiers; Predictive models; static power", keywords-plus = "PERFORMANCE", number-of-cited-references = "12", oa = "Bronze", research-areas = "Computer Science", times-cited = "0", unique-id = "Sen:2016:GFM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jung:2016:LPS, author = "Daejin Jung and Sheng Li and Jung Ho Ahn", title = "Large Pages on Steroids: Small Ideas to Accelerate Big Memory Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "101--104", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2495103", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Utilizing small (e.g., 4 KB) pages incurs frequent TLB misses on modern big memory applications, substantially degrading the performance of the system. Large (e.g., 1 GB) pages or direct segments can alleviate this penalty due to page table walks, but at the same time such a strategy exposes the organizational and operational details of modern DRAM-based memory systems to applications. 
Row-buffer conflicts caused by accesses heading to the same DRAM bank but different rows from multiple threads are regarded as the main culprits behind the very large gaps between peak and achieved main memory throughput, but hardware-based approaches in memory controllers have achieved only limited success whereas existing proposals that change memory allocators cannot be applied to large pages or direct segments. In this paper, we propose a set of application-level techniques to improve the effective main memory bandwidth. The techniques stem from the two key observations that (1) each thread of an application exclusively accesses certain datasets for a short or long period of time, and (2) superfluous memory reads originating from a cache's write allocation policy can be avoided if scatters during the data shuffling pass through intermediate cache-friendly buffers. Experiments with a contemporary x86 server show that combining large pages with the proposed address linearization, bank coloring, and write streaming techniques improves the performance of the three big memory applications of high-throughput key-value store, fast-Fourier transform, and radix sort by 37.6, 22.9, and 68.1 percent, respectively.", acknowledgement = ack-nhfb, affiliation = "Jung, D (Reprint Author), Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul, South Korea. Jung, Daejin; Ahn, Jung Ho, Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul, South Korea. Li, Sheng, Intel Labs, Santa Clara, CA USA. Ahn, Jung Ho, Seoul Natl Univ, Big Data Inst, Seoul, South Korea.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "haidj@snu.ac.kr sheng.r.li@intel.com gajh@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea - Korea government [NRF-2014R1A2A1A11052936, NRF-2012M3A9D1054622]", funding-text = "The authors thank Jongwook Chung and Jaeyoon Choi on their contributions to application writing and experiments. This work was partially supported by the National Research Foundation of Korea grant funded by the Korea government (NRF-2014R1A2A1A11052936 and NRF-2012M3A9D1054622). Jung Ho Ahn is also with Big Data Institute, Seoul National University.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "address linearization; application-level techniques; Bandwidth; bank coloring; big memory applications; cache storage; cache write allocation policy; cache-friendly buffers; data shuffling; DRAM bank; DRAM chips; DRAM-based memory; fast-Fourier transform; high-throughput key-value store; Instruction sets; large pages; memory allocators; memory bandwidth; memory controllers; Memory management; memory throughput; multi-threading; multiple threads; Performance gain; Physical-to-DRAM address mapping; radix sort; Random access memory; row-buffer conflicts; Servers; superfluous memory reads; write streaming", number-of-cited-references = "14", ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394", research-areas = "Computer Science", times-cited = "0", unique-id = "Jung:2016:LPS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Verdu:2016:PSA, author = "Javier Verdu and Alex Pajuelo", title = "Performance Scalability Analysis of {JavaScript} Applications with {Web} Workers", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "105--108", month = jul # "\slash " # dec, year = "2016", CODEN = 
"????", DOI = "https://doi.org/10.1109/LCA.2015.2494585", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Web applications are getting closer to the performance of native applications taking advantage of new standard-based technologies. The recent HTML5 standard includes, among others, the Web Workers API that allows executing JavaScript applications on multiple threads, or workers. However, the internals of the browser's JavaScript virtual machine does not expose direct relation between workers and running threads in the browser and the utilization of logical cores in the processor. As a result, developers do not know how performance actually scales on different environments and therefore what is the optimal number of workers on parallel JavaScript codes. This paper presents the first performance scalability analysis of parallel web apps with multiple workers. We focus on two case studies representative of different worker execution models. Our analyses show performance scaling on different parallel processor microarchitectures and on three major web browsers in the market. Besides, we study the impact of co-running applications on the web app performance. The results provide insights for future approaches to automatically find out the optimal number of workers that provide the best tradeoff between performance and resource usage to preserve system responsiveness and user experience, especially on environments with unexpected changes on system workload.", acknowledgement = ack-nhfb, affiliation = "Verdu, J (Reprint Author), BarcelonaTECH UPC, Dept Comp Architecture, Barcelona, Spain. 
Verdu, Javier; Pajuelo, Alex, BarcelonaTECH UPC, Dept Comp Architecture, Barcelona, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jverdu@ac.upc.edu mpajuelo@ac.upc.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish Ministry of Economy and Competitiveness (MINECO) [TIN2012-34557]", funding-text = "This work has been supported by the Spanish Ministry of Economy and Competitiveness (MINECO) under contract TIN2012-34557.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application program interfaces; Benchmark testing; Browsers; Computer architecture; HTML5; HTML5 standard; hypermedia markup languages; Internet; Java; javascript; JavaScript applications; Message systems; Microarchitecture; multithreading; Multithreading; multithreading; online front-ends; parallel processing; parallel processor microarchitectures; parallel Web apps; parallelism; performance scalability analysis; resource usage; Scalability; standard-based technologies; system responsiveness preservation; user experience; Web applications; web apps; Web browsers; web workers; Web workers API; worker execution models", number-of-cited-references = "12", oa = "Green Published", ORCID-numbers = "Pajuelo, Alex/0000-0002-5510-6860 Verdu Mula, Javier/0000-0003-4485-2419", research-areas = "Computer Science", times-cited = "1", unique-id = "Verdu:2016:PSA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Delimitrou:2016:SID, author = "Christina Delimitrou and Christos Kozyrakis", title = "Security Implications of Data Mining in Cloud Scheduling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "109--112", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2461215", ISSN = "1556-6056 
(print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cloud providers host an increasing number of popular applications, on the premise of resource flexibility and cost efficiency. Most of these systems expose virtualized resources of different types and sizes. As instances share the same physical host to increase utilization, they contend on hardware resources, e.g., last-level cache, making them vulnerable to side-channel attacks from co-scheduled applications. In this work we show that using data mining techniques can help an adversarial user of the cloud determine the nature and characteristics of co-scheduled applications and negatively impact their performance through targeted contention injections. We design Bolt, a simple runtime that extracts the sensitivity of co-scheduled applications to various types of interference and uses this signal to determine the type of these applications by applying a set of data mining techniques. We validate the accuracy of Bolt on a 39-server cluster. Bolt correctly identifies the type and characteristics of 81 percent out of 108 victim applications, and constructs specialized contention signals that degrade their performance. We also use Bolt to find the most commonly-run applications on EC2. We hope that underlining such security vulnerabilities in modern cloud facilities will encourage cloud providers to introduce stronger resource isolation primitives in their systems.", acknowledgement = ack-nhfb, affiliation = "Delimitrou, C (Reprint Author), Stanford Univ, Dept Elect Engn, Stanford, CA 94305 USA. Delimitrou, Christina; Kozyrakis, Christos, Stanford Univ, Dept Elect Engn, Stanford, CA 94305 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "cdel@stanford.edu kozyraki@stanford.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "39-server cluster; application studies resulting in better multiple-processor systems; Bolt; Cloud computing; cloud computing; cloud facilities; cloud providers; co-scheduled applications; Computer crime; cost efficiency; cryptography; data mining; Data mining; Degradation; Interference; resource allocation; resource flexibility; resource isolation primitives; scheduling and task partitioning; security and privacy protection; security vulnerabilities; Servers; side-channel attacks; specialized contention signals; Super (very large) computers; virtualized resources", number-of-cited-references = "21", research-areas = "Computer Science", times-cited = "0", unique-id = "Delimitrou:2016:SID", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2016:SMF, author = "Zhenning Wang and Jun Yang and Rami Melhem and Bruce Childers and Youtao Zhang and Minyi Guo", title = "Simultaneous Multikernel: Fine-Grained Sharing of {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "113--116", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2477405", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Studies show that non-graphics programs can be less optimized for the GPU hardware, leading to significant resource under-utilization. 
Sharing the GPU among multiple programs can effectively improve utilization, which is particularly attractive to systems (e.g., cloud computing) where many applications require access to the GPU. However, current GPUs lack proper architecture features to support sharing. Initial attempts are very preliminary in that they either provide only static sharing, which requires recompilation or code transformation, or they do not effectively improve GPU resource utilization. We propose Simultaneous Multikernel (SMK), a fine-grained dynamic sharing mechanism, that fully utilizes resources within a streaming multiprocessor by exploiting heterogeneity of different kernels. We extend the GPU hardware to support SMK, and propose several resource allocation strategies to improve system throughput while maintaining fairness. Our evaluation of 45 shared workloads shows that SMK improves GPU throughput by 34 percent over non-shared execution and 10 percent over a state-of-the-art design.", acknowledgement = ack-nhfb, affiliation = "Wang, ZN (Reprint Author), Shanghai Jiao Tong Univ, Dept Comp Sci, Shanghai, Peoples R China. Wang, Zhenning; Guo, Minyi, Shanghai Jiao Tong Univ, Dept Comp Sci, Shanghai, Peoples R China. Yang, Jun, Univ Pittsburgh, Elect \& Comp Engn Dept, Pittsburgh, PA 15260 USA. Melhem, Rami; Childers, Bruce; Zhang, Youtao, Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "znwang@sjtu.edu.cn juy9@pitt.edu melhem@cs.pitt.edu childers@cs.pitt.edu zhangyt@cs.pitt.edu guo-my@cs.sjtu.edu.cn", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Basic Research 973 Program of China [2015CB352403]; National Natural Science Foundation of China (NSFC) [61261160502, 61272099]; CSC scholarship; US National Science Foundation (NSF) [CNS-1012070, CNS-1305220, CCF-1422331]", funding-text = "This work is supported in part by the National Basic Research 973 Program of China (No. 2015CB352403), the National Natural Science Foundation of China (NSFC) (Nos. 61261160502, 61272099), the CSC scholarship, US National Science Foundation (NSF) grants CNS-1012070, CNS-1305220, and CCF-1422331.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Context switch; fine-grained dynamic sharing mechanism; GPU; GPU hardware; GPU resource utilization improvement; graphics processing units; Graphics processing units; multiprocessing programs; multiprocessor streaming; multitasking; Multitasking; multitasking; nongraphic programs; resource allocation; Resource management; resource under-utilization; SMK; static sharing; Switches; Throughput", number-of-cited-references = "17", oa = "Bronze", research-areas = "Computer Science", times-cited = "4", unique-id = "Wang:2016:SMF", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhang:2016:SIW, author = "Chulian Zhang and Hamed Tabkhi and Gunar Schirner", title = "Studying Inter-Warp Divergence Aware Execution on {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "117--120", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2478778", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = 
"1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This letter quantitatively studies the benefits of inter-warp divergence aware execution on GPUs. To that end, the letter first proposes a novel approach to quantify the inter-warp divergence by measuring the temporal similarity in execution progress of concurrent warps, which we call Warp Progression Similarity (WPS). Based on the WPS metric, this letter proposes a WPS-aware Scheduler (WPSaS) to optimize GPU throughput. The aim is to manage inter-warp divergence to hide memory access latency and minimize resource conflicts and temporal under-utilization in compute units allowing GPUs to achieve their peak throughput. Our results demonstrate that WPSaS improves throughput by 10 percent with a pronounced reduction in resource conflicts and temporal under-utilization.", acknowledgement = ack-nhfb, affiliation = "Zhang, CL (Reprint Author), Northeastern Univ, Dept Elect \& Comp Engn, Boston, MA 02115 USA. Zhang, Chulian; Tabkhi, Hamed; Schirner, Gunar, Northeastern Univ, Dept Elect \& Comp Engn, Boston, MA 02115 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "zhang.chul@husky.neu.edu tabkhi@ece.neu.edu schirner@ece.neu.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [1319501]", funding-text = "This material is based upon work supported by the National Science Foundation under Award No. 1319501.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Computer architecture; concurrent warps; GPU scheduler; GPU throughput optimization; Graphics processing units; graphics processing units; Histograms; Inter-warp divergence; interwarp divergence aware execution; interwarp divergence management; Measurement; memory access latency hiding; Processor scheduling; resource allocation; resource conflict minimization; scheduling; temporal similarity measurement; temporal underutilization; Throughput; warp progression similarity; warp progression similarity (WPS); WPS metric; WPS-aware scheduler; WPSaS", number-of-cited-references = "8", oa = "Bronze", research-areas = "Computer Science", times-cited = "1", unique-id = "Zhang:2016:SIW", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tavakkol:2016:TTB, author = "Arash Tavakkol and Pooyan Mehrvarzy and Hamid Sarbazi-Azad", title = "{TBM}: Twin Block Management Policy to Enhance the Utilization of Plane-Level Parallelism in {SSDs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "121--124", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2461162", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The internal architecture of a SSD provides channel-, chip-, die- and plane-level parallelism levels, to concurrently perform multiple data accesses and compensate for the performance gap between a single flash chip and host interface. Although a good striping strategy can effectively exploit the first three levels, parallel I/O accesses at plane-level can be performed only for operations of the same types and page addresses. 
In this work, we propose the Twin Block Management (TBM) policy that symmetrically conducts usage and recycling of the flash block addresses on the planes of a die, thus enhancing the utilization of plane-level parallelism for reads, writes and erases. Evaluation results show that TBM improves IOPS and response time by up to 73 and 42 percent, respectively.", acknowledgement = ack-nhfb, affiliation = "Tavakkol, A (Reprint Author), Sharif Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol, Arash; Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran. Mehrvarzy, Pooyan; Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch Comp Sci, Tehran, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tavakkol@ce.sharif.edu p.mehrvarzy@ipm.ir azad@ipm.ir", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "channel-level parallelism level; chip-level parallelism level; die-level parallelism level; flash block; flash chip; flash memories; Flash memory; garbage collection; host interface; IOPS; memory architecture; multiple data accesses; parallel processing; Parallel processing; performance evaluation; plane-level parallelism; plane-level parallelism level; Recycling; Resource management; response time; Solid state circuits; solid-state drive; SSD internal architecture; TBM; Time factors; twin block management", number-of-cited-references = "11", ORCID-numbers = "Tavakkol, Arash/0000-0003-3859-1259", research-areas = "Computer Science", times-cited = "0", unique-id = "Tavakkol:2016:TTB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jacob:2016:PPT, author = "Bruce Jacob", title = "The 2 {PetaFLOP}, 3 Petabyte, 9 {TB/s}, 90 {kW} Cabinet: A System Architecture for Exascale and Big Data", 
journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "125--128", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2451652", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We present a system architecture that uses high-efficiency processors as opposed to high-performance processors, NAND flash as byte-addressable main memory, and high-speed DRAM as a cache front-end for the flash. The main memory system is interconnected and presents a unified global address space to the client microprocessors. A single cabinet contains 2,550 nodes, networked in a highly redundant modified Moore graph that yields a bisection bandwidth of 9.1 TB/s and a worst-case latency of four hops from any node to any other. At a per-cabinet level, the system supports a minimum of 2.6 petabytes of main memory, dissipates 90 kW, and achieves 2.2 PetaFLOPS. The system architecture provides several features desirable in today's large-scale systems, including a global shared physical address space (and optional support for a global shared virtual space as well), the ability to partition the physical space unequally among clients as in a unified cache architecture (e.g., so as to support multiple VMs in a datacenter), pairwise system-wide sequential consistency on user-specified address sets, built-in checkpointing via journaled non-volatile main memory, memory cost-per-bit approaching that of NAND flash, and memory performance approaching that of pure DRAM.", acknowledgement = ack-nhfb, affiliation = "Jacob, B (Reprint Author), Univ Maryland, Elect \& Comp Engn, College Pk, MD 20742 USA. Jacob, Bruce, Univ Maryland, Elect \& Comp Engn, College Pk, MD 20742 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "blj@umd.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Big Data; Big data; Big Data; bisection bandwidth; built-in checkpointing; byte-addressable main memory; cache storage; checkpointing; DRAM chips; exascale computing; extremely large; extremely large, high radix network topologies; flash memories; High performance computing; high-efficiency processors high-performance processors; High-performance computing; high-radix network topologies; high-speed DRAM; journaled main memory; memory architecture; Memory management; memory performance; microprocessor chips; microprocessors; NAND flash; Network topology; nonvolatile main memory; pairwise system-wide sequential consistency; parallel architectures; PetaFLOP; Ports (Computers); Program processors; Random access memory; redundant modified Moore graph; system architecture; user-specified address sets", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "1", unique-id = "Jacob:2016:PPT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Xiao:2016:TAC, author = "He Xiao and Wen Yueh and Saibal Mukhopadhyay and Sudhakar Yalamanchili", title = "Thermally Adaptive Cache Access Mechanisms for {3D} Many-Core Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "129--132", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2495125", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "A compelling confluence of technology and application trends in which the cost, execution time, 
and energy of applications are being dominated by the memory system is driving the industry to 3D packages for future microarchitectures. However, these packages result in high heat fluxes and increased thermal coupling challenging current thermal solutions. Conventional design approaches utilize design margins that correspond to worst case temperatures and process corners leading to a significant impact on system level performance. This paper advocates a design approach based on microarchitecture adaptation to device-level temperature-dependent delay variations to realize average case performance that is superior to which can be achieved by using worst case design margins. We demonstrate this approach with adaptation principles for the last level cache (LLC) in a 3D many-core architecture. We propose and evaluate two adaptation mechanisms. In the first case, the access time to the LLC from the L1 tracks the LLC's temperature-delay variations. In the second case, the processor DVFS state tracks the LLC temperature as a negative feedback. Compared to a worst case design baseline, the full system simulation results show that both approaches increase the IPC by over 20 percent, and improve the energy efficiency by up to 3 percent.", acknowledgement = ack-nhfb, affiliation = "Xiao, H (Reprint Author), Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta, GA 30332 USA. Xiao, He; Yueh, Wen; Mukhopadhyay, Saibal; Yalamanchili, Sudhakar, Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta, GA 30332 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "hxiao@gatech.edu wyueh3@gatech.edu saibal.mukhopadhyay@ece.gatech.edu sudha.yalamanchili@ece.gatech.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Semiconductor Research Corporation under SRC [2318.001]; National Science Foundation [CNS-0855110]", funding-text = "This research is supported and sponsored by the Semiconductor Research Corporation under SRC task 2318.001, and the National Science Foundation under grant CNS-0855110.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3D IC; 3D IC, SRAM cache, adaptive architecture, performance gain, energy efficiency; 3D many-core architectures; Adaptation models; adaptive architecture; Cache memory; cache storage; Computer architecture; device-level temperature-dependent delay variations; energy efficiency; integrated circuit design; Integrated circuit modeling; last level cache; LLC temperature; memory architecture; Microarchitecture; microarchitecture adaptation; microarchitectures; multiprocessing systems; performance evaluation; performance gain; power aware computing; processor DVFS state; Random access memory; SRAM cache; system level performance; thermal coupling challenging current thermal solutions; thermally adaptive cache access mechanisms; Three-dimensional displays", number-of-cited-references = "13", oa = "Bronze", research-areas = "Computer Science", times-cited = "0", unique-id = "Xiao:2016:TAC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hu:2016:TDM, author = "Qi Hu and Peng Liu and Michael C. 
Huang", title = "Threads and Data Mapping: Affinity Analysis for Traffic Reduction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "133--136", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2451172", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Modern processors spend significant amount of time and energy moving data. With the increase in core count, the relative importance of such latency and energy expenditure will only increase with time. Inter-core communication traffic when executing a multithreaded application is one such source of latency and energy expenditure. This traffic is influenced by the mapping of threads and data onto multicore systems. This paper investigates the impact of threads and data mapping on traffic in a chip-multiprocessor, and exploits the potential for traffic reduction through threads and data mapping. Based on the analysis and estimation of the lowest traffic, we propose a threads and data mapping mechanism to approach the lowest traffic. The mapping takes both the correlation among threads and the affinity of data with individual threads into account, and results in significant traffic reduction and energy savings.", acknowledgement = ack-nhfb, affiliation = "Liu, P (Reprint Author), Zhejiang Univ, Coll Informat Sci \& Elect Engn, Hangzhou 310027, Peoples R China. Hu, Qi; Liu, Peng, Zhejiang Univ, Coll Informat Sci \& Elect Engn, Hangzhou 310027, Peoples R China. Huang, Michael C., Univ Rochester, Dept Elect \& Comp Engn, 601 Elmwood Ave, Rochester, NY 14627 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "huqi\_isee@zju.edu.cn liupeng@zju.edu.cn michael.huang@rochester.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSFC [61028004]; US National Science Foundation (NSF) [1217662, 1255729]; Open Project Program of the State Key Laboratory of Mathematical Engineering and Advanced Computing [2014A08, 2015A09]", funding-text = "This work was supported by NSFC under grant 61028004, and also in part by US National Science Foundation (NSF) under grants 1217662 and 1255729, and the Open Project Program of the State Key Laboratory of Mathematical Engineering and Advanced Computing under grants 2014A08 and 2015A09. P. Liu is the corresponding author.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "affinity analysis; chip-multiprocessor; Correlation; data mapping; energy conservation; energy savings; Instruction sets; intercore communication traffic; Mapping; memory; Message systems; microprocessor chips; modern processors; multi-threading; multicore; Multicore processing; multicore systems; multiprocessing systems; multithreaded application; network-on-chip; Network-on-chip; network-on-chip; Statistical analysis; thread mapping; traffic; traffic reduction", keywords-plus = "NETWORKS; CACHES; CHIP", number-of-cited-references = "11", oa = "Bronze", research-areas = "Computer Science", times-cited = "0", unique-id = "Hu:2016:TDM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2016:TCb, author = "Anonymous", title = "Table of Contents", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "C1--C1", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2628298", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue 
Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:Ce, author = "Anonymous", title = "Cover", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "C2--C2", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2628299", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:Cf, author = "Anonymous", title = "Cover", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "C3--C3", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2628301", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2016:TCBa, author = "Anonymous", title = "Table of contents [back cover]", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "C4--C4", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2628302", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Beckmann:2017:CCM, author = "Nathan Beckmann and Daniel Sanchez", title = "Cache Calculus: Modeling Caches through Differential Equations", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "1--5", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2512873", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Caches are critical to performance, yet their behavior is hard to understand and model. In particular, prior work does not provide closed-form solutions of cache performance, i.e., simple expressions for the miss rate of a specific access pattern. Existing cache models instead use numerical methods that, unlike closed-form solutions, are computationally expensive and yield limited insight. 
We present cache calculus, a technique that models cache behavior as a system of ordinary differential equations, letting standard calculus techniques find simple and accurate solutions of cache performance for common access patterns.", acknowledgement = ack-nhfb, affiliation = "Beckmann, N (Reprint Author), MIT CSAIL, Cambridge, MA 02139 USA. Beckmann, Nathan; Sanchez, Daniel, MIT CSAIL, Cambridge, MA 02139 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "beckmann@csail.mit.edu sanchez@csail.mit.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-1318384]; Qatar Computing Research Institute", funding-text = "This work was supported in part by NSF grant CCF-1318384 and a grant from the Qatar Computing Research Institute.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arrays; cache behavior models; cache calculus; cache memory; cache storage; closed-form solutions; Closed-form solutions; closed-form solutions; Computational modeling; Computer architecture; computer architecture; Computer architecture; differential equations; Differential equations; differential equations; mathematical model; Mathematical model; miss rate; Numerical models; ordinary differential equations", number-of-cited-references = "8", oa = "Bronze", research-areas = "Computer Science", times-cited = "1", unique-id = "Beckmann:2017:CCM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2017:IIC, author = "Anonymous", title = "2016 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 
15", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "1--6", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2653771", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zhan:2017:CCS, author = "Xin Zhan and Reza Azimi and Svilen Kanev and David Brooks and Sherief Reda", title = "{CARB}: a {C}-State Power Management Arbiter for Latency-Critical Workloads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "6--9", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2537802", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Latency-critical workloads in datacenters have tight response time requirements to meet service-level agreements (SLAs). Sleep states (c-states) enable servers to reduce their power consumption during idle times; however entering and exiting c-states is not instantaneous, leading to increased transaction latency. In this paper we propose a c-state arbitration technique, CARB, that minimizes response time, while simultaneously realizing the power savings that could be achieved from enabling c-states. CARB adapts to incoming request rates and processing times and activates the smallest number of cores for processing the current load. CARB reshapes the distribution of c-states and minimizes the latency cost of sleep by avoiding going into deep sleeps too often. 
We quantify the improvements from CARB with memcached running on an 8-core Haswell-based server.", acknowledgement = ack-nhfb, affiliation = "Zhan, X (Reprint Author), Brown Univ, Providence, RI 02906 USA. Zhan, Xin; Azimi, Reza; Reda, Sherief, Brown Univ, Providence, RI 02906 USA. Kanev, Svilen; Brooks, David, Harvard Univ, Cambridge, MA 02138 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "xin\_zhan@brown.edu reza\_azimi@brown.edu skanev@eecs.harvard.edu dbrooks@eecs.harvard.edu sherief\_reda@brown.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [1305148, 1438958]", funding-text = "The authors would like to thank the anonymous reviewers for their comments. The research of X. Zhan, R. Azimi, and S. Reda was supported by NSF under Grants 1305148 and 1438958.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "c-state; c-state arbitration technique; c-state distribution; c-state power management arbiter; cache storage; CARB; computer centres; contracts; datacenters; Delays; energy-efficient; feedback controller; Haswell-based server; idle times; latency cost minimization; Latency-critical workloads; latency-critical workloads; memcached; Monitoring; Optimization; power aware computing; power consumption; Power demand; power savings; processing times; request rates; response time minimization; Servers; service-level agreements; SLA; sleep states; Time factors; workload consolidation", number-of-cited-references = "10", oa = "Bronze", research-areas = "Computer Science", times-cited = "1", unique-id = "Zhan:2017:CCS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jeon:2017:CCA, author = "Dong-Ik Jeon and Ki-Seok Chung", title = "{CasHMC}: a Cycle-Accurate Simulator for Hybrid Memory Cube", journal = 
j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "10--13", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2600601", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "3D-stacked DRAM has been actively studied to overcome the limits of conventional DRAM. The Hybrid Memory Cube (HMC) is a type of 3D-stacked DRAM that has drawn great attention because of its usability for server systems and processing-in-memory (PIM) architecture. Since HMC is not directly stacked on the processor die where the central processing units (CPUs) and graphic processing units (GPUs) are integrated, HMC has to be linked to other processor components through high speed serial links. Therefore, the communication bandwidth and latency should be carefully estimated to evaluate the performance of HMC. However, most existing HMC simulators employ only simple HMC modeling. In this paper, we propose a cycle-accurate simulator for hybrid memory cube called CasHMC. It provides a cycle-by-cycle simulation of every module in an HMC and generates analysis results including a bandwidth graph and statistical data. Furthermore, CasHMC is implemented in C++ as a single wrapped object that includes an HMC controller, communication links, and HMC memory. Instantiating this single wrapped object facilitates simultaneous simulation in parallel with other simulators that generate memory access patterns such as a processor simulator or a memory trace generator.", acknowledgement = ack-nhfb, affiliation = "Jeon, DI (Reprint Author), Hanyang Univ, Dept Elect \& Comp Engn, Seoul 04763, South Korea. Jeon, Dong-Ik; Chung, Ki-Seok, Hanyang Univ, Dept Elect \& Comp Engn, Seoul 04763, South Korea.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "estwingz@naver.com kchung@hanyang.ac.kr", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Institute for Information \& communications Technology Promotion (IITP) --- Korea government (MSIP) [R7119-16-1009]", funding-text = "This work was supported by Institute for Information \& communications Technology Promotion (IITP) grant funded by the Korea government (MSIP) (R7119-16-1009, Development of Intelligent Semiconductor Core Technologies for IoT Devices based on Harvest Energy). Ki-Seok Chung is the corresponding author.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3D-stacked DRAM; Analytical models; Bandwidth; bandwidth graph; Benchmark testing; CasHMC; central processing units; communication bandwidth; communication links; Computational modeling; Computer architecture; CPU; cycle-accurate simulator; cycle-by-cycle simulation; C++; DRAM chips; GPU; graph theory; graphic processing units; high-speed serial links; HMC controller; HMC memory; HMC simulators; hybrid memory cube; latency; memory access patterns; memory architecture; Memory control and access; memory design; memory trace generator; modeling of computer architecture; performance evaluation; PIM architecture; processing-in-memory architecture; processor simulator; Random access memory; server systems; simulation; Simulation; simulation; single-wrapped object instantiation; statistical analysis; statistical data", number-of-cited-references = "10", ORCID-numbers = "CHUNG, KI-SEOK/0000-0002-2908-8443 Jeon, Dong-Ik/0000-0002-8572-4184", research-areas = "Computer Science", times-cited = "6", unique-id = "Jeon:2017:CCA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wu:2017:CSB, author = "Hao Wu and Fangfei Liu and Ruby B. 
Lee", title = "Cloud Server Benchmark Suite for Evaluating New Hardware Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "14--17", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2597818", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Adding new hardware features to a cloud computing server requires testing both the functionality and the performance of the new hardware mechanisms. However, commonly used cloud computing server workloads are not well-represented by the SPEC integer and floating-point benchmark and Parsec suites typically used by the computer architecture community. Existing cloud benchmark suites for scale-out or scale-up computing are not representative of the most common cloud usage, and are very difficult to run on a cycle-accurate simulator that can accurately model new hardware, like gem5. In this paper, we present PALMScloud, a suite of cloud computing benchmarks for performance evaluation of cloud servers, that is ready to run on the gem5 cycle-accurate simulator. We conduct a behavior characterization and analysis of the benchmarks. We hope that these cloud benchmarks, ready to run on a dual-machine gem5 simulator or on real machines, can be useful to other researchers interested in improving hardware micro-architecture and cloud server performance.", acknowledgement = ack-nhfb, affiliation = "Wu, H (Reprint Author), Princeton Univ, Princeton, NJ 08544 USA. Wu, Hao; Liu, Fangfei; Lee, Ruby B., Princeton Univ, Princeton, NJ 08544 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "haow.princeton@gmail.com fangfeil@princeton.edu rblee@princeton.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "DHS/AFRL [FA8750-12-2-0295]; National Science Foundation [CNS-1218817]", funding-text = "This work was supported in part by DHS/AFRL FA8750-12-2-0295 and US National Science Foundation CNS-1218817.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "behavior characterization; Benchmark testing; benchmarks; Cloud Computing; Cloud computing; cloud computing; cloud computing benchmarks; cloud computing server workloads; cloud server benchmark; cloud servers; cloud usage; computer architecture; computer architecture community; cycle accurate simulator; dual machine gem5 simulator; floating-point benchmark; gem5; Hardware; new hardware architectures; new hardware mechanisms; Parsec; performance evaluation; Performance evaluation; scale-out computing; scale-up computing; simulation; SPEC integer", number-of-cited-references = "8", oa = "Bronze", research-areas = "Computer Science", times-cited = "0", unique-id = "Wu:2017:CSB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Seyedzadeh:2017:CBT, author = "Seyed Mohammad Seyedzadeh and Alex K. 
Jones and Rami Melhem", title = "Counter-Based Tree Structure for Row Hammering Mitigation in {DRAM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "18--21", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2614497", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Scaling down DRAM technology degrades cell reliability due to increased coupling between adjacent DRAM cells, commonly referred to as crosstalk. Moreover, high access frequency of certain cells (hot cells) may cause data loss in neighboring cells in adjacent rows due to crosstalk, which is known as row hammering. In this work, the goal is to mitigate row hammering in DRAM cells through a Counter-Based Tree (CBT) approach. This approach uses a tree of counters to detect hot rows and then refreshes neighboring cells. In contrast to existing deterministic solutions, CBT utilizes fewer counters that makes it practically feasible to be implemented on-chip. Compared to existing probabilistic approaches, CBT more precisely refreshes rows vulnerable to row hammering based on their access frequency. Experimental results on workloads from three benchmark suites show that CBT can reduce the refresh energy by more than 60 percent and nearly 70 percent in comparison to leading probabilistic and deterministic approaches, respectively. Furthermore, hardware evaluation shows that CBT can be easily implemented on-chip with only a nominal overhead.", acknowledgement = ack-nhfb, affiliation = "Seyedzadeh, SM (Reprint Author), Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA. Seyedzadeh, Seyed Mohammad; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA. Jones, Alex K., Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. 
Archit. Lett.", author-email = "seyedzadeh@cs.pitt.edu akjones@pitt.edu melhem@cs.pitt.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-1064976]; SGMI grant from Samsung electronics", funding-text = "This work is supported by NSF grants CCF-1064976 and an SGMI grant from Samsung electronics. We thank the anonymous reviewers for their feedback.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "CBT; Computer architecture; counter-based tree structure; crosstalk; Crosstalk; crosstalk; DRAM; DRAM chips; dynamic random-access memory; Microprocessors; Radiation detectors; Random access memory; reliability; Reliability; reliability; row hammering mitigation; System-on-chip", keywords-plus = "REFRESH; MEMORY", number-of-cited-references = "17", oa = "Bronze", research-areas = "Computer Science", times-cited = "3", unique-id = "Seyedzadeh:2017:CBT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Naghibijouybari:2017:CCG, author = "Hoda Naghibijouybari and Nael Abu-Ghazaleh", title = "Covert Channels on {GPGPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "22--25", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2590549", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "GPUs are increasingly used to accelerate the performance of not only graphics workloads, but also data intensive applications. In this paper, we explore the feasibility of covert channels in General Purpose Graphics Processing Units (GPGPUs). 
We consider the possibility of two colluding malicious applications using the GPGPU as a covert channel to communicate, in the absence of a direct channel between them. Such a situation may arise in cloud environments, or in environments employing containment mechanisms such as dynamic information flow tracking. We reverse engineer the block placement algorithm to understand co-residency of blocks from different applications on the same Streaming Multiprocessor (SM) core, or on different SMs concurrently. In either mode, we identify the shared resources that may be used to create contention. We demonstrate the bandwidth of two example channels: one that uses the L1 constant memory cache to enable communication on the same SM, and another that uses the L2 constant memory caches to enable communication between different SMs. We also examine the possibility of increasing the bandwidth of the channel by using the available parallelism on the GPU, achieving a bandwidth of over 400 Kbps. This study demonstrates that GPGPUs are a feasible medium for covert communication.", acknowledgement = ack-nhfb, affiliation = "Naghibijouybari, H (Reprint Author), Univ Calif Riverside, Dept Comp Sci \& Engn, Riverside, CA 92521 USA. Naghibijouybari, Hoda; Abu-Ghazaleh, Nael, Univ Calif Riverside, Dept Comp Sci \& Engn, Riverside, CA 92521 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "hnagh001@ucr.edu naelag@ucr.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation [CNS-1422401]", funding-text = "This work is partially supported by US National Science Foundation grant CNS-1422401.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; Bandwidth; block placement algorithm; cache storage; Computer architecture; covert channel; general purpose graphics processing units; GPGPU; Graphics processing units; graphics processing units; Kernel; L1 constant memory cache; L2 constant memory caches; malicious applications; multiprocessing systems; Security; security of data; SM core; streaming multiprocessor core; Trojan horses", number-of-cited-references = "23", oa = "Bronze", research-areas = "Computer Science", times-cited = "2", unique-id = "Naghibijouybari:2017:CCG", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Song:2017:EPU, author = "Wonjun Song and Hyung-Joon Jung and Jung Ho Ahn and Jae W. Lee and John Kim", title = "Evaluation of Performance Unfairness in {NUMA} System Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "26--29", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2602876", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "NUMA (Non-uniform memory access) system architectures are commonly used in high-performance computing and datacenters. Within each architecture, a processor-interconnect is used for communication between the different sockets and examples of such interconnect include Intel QPI and AMD HyperTransport. In this work, we explore the impact of the processor-interconnect on overall performance-in particular, we explore the impact on performance fairness from the processor-interconnect arbitration. 
It is well known that locally-fair arbitration does not guarantee globally-fair bandwidth sharing as closer nodes receive more bandwidth in a multi-hop network. However, this paper is the first to demonstrate the opposite can occur in commodity NUMA servers where remote nodes receive higher bandwidth (and perform better). This problem occurs because router micro-architectures for processor-interconnects commonly employ external concentration. While accessing remote memory can occur in any NUMA system, performance unfairness (or performance variation) is more critical in cloud computing and virtual machines with shared resources. We demonstrate how this unfairness creates significant performance variation when executing workloads on the Xen virtualization platform. We then provide analysis using synthetic workloads to better understand the source of unfairness.", acknowledgement = ack-nhfb, affiliation = "Song, W (Reprint Author), Korea Adv Inst Sci \& Technol, Daejeon, South Korea. Song, Wonjun; Jung, Hyung-Joon; Kim, John, Korea Adv Inst Sci \& Technol, Daejeon, South Korea. Ahn, Jung Ho; Lee, Jae W., Seoul Natl Univ, Seoul, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "iamwonjunsong@kaist.edu hans7taiji@kaist.edu gajh@snu.ac.kr jaewlee@snu.ac.kr jjk12@kaist.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Google Faculty Research Award, National Research Foundation of Korea [NRF-2013R1A2A2A01069132, NRF-2014R1A2A1A11052936, NRF-2015M3C4A7065647]; MSIP under the ITRC [IITP-2016-H8501-16-1005]", funding-text = "This work was supported in part by Google Faculty Research Award, National Research Foundation of Korea (NRF-2013R1A2A2A01069132, NRF-2014R1A2A1A11052936, and NRF-2015M3C4A7065647), and in part by MSIP under the ITRC (IITP-2016-H8501-16-1005).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "AMD HyperTransport; Bandwidth; cloud computing; globally-fair bandwidth sharing; high-performance computing; Intel QPI; locally-fair arbitration; memory architecture; Micromechanical devices; multihop network; Multiprocessor interconnection; nonuniform memory access system architectures; NUMA; NUMA system architecture; parallel processing; performance unfairness evaluation; processor-interconnect; processor-interconnect arbitration; processor-interconnects; router microarchitectures; Servers; shared resources; Sockets; System-on-chip; unfairness; virtual machines; Virtual machining; Xen virtualization platform", number-of-cited-references = "8", research-areas = "Computer Science", researcherid-numbers = "Kim, John/C-1792-2011", times-cited = "1", unique-id = "Song:2017:EPU", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Verner:2017:EAL, author = "Uri Verner and Avi Mendelson and Assaf Schuster", title = "Extending {Amdahl's Law} for Multicores with Turbo Boost", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "30--33", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2512982", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Rewriting sequential programs to make use of multiple cores requires considerable effort. For many years, Amdahl's law has served as a guideline to assess the performance benefits of parallel programs over sequential ones, but recent advances in multicore design introduced variability in the performance of the cores and motivated the reexamination of the underlying model. 
This paper extends Amdahl's law for multicore processors with built-in dynamic frequency scaling mechanisms such as Intel's Turbo Boost. Using a model that captures performance dependencies between cores, we present tighter upper bounds for the speedup and reduction in energy consumption of a parallel program over a sequential one on a given multicore processor and validate them on Haswell and Sandy Bridge Intel CPUs. Previous studies have shown that from a processor design perspective, Turbo Boost mitigates the speedup limitations obtained under Amdahl's law by providing higher performance for the same energy budget. However, our new model and evaluation show that from a software development perspective, Turbo Boost aggravates these limitations by making parallelization of sequential codes less profitable.", acknowledgement = ack-nhfb, affiliation = "Verner, U (Reprint Author), Technion, Dept Comp Sci, Haifa, Israel. Verner, Uri; Mendelson, Avi; Schuster, Assaf, Technion, Dept Comp Sci, Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "uriv@cs.technion.ac.il avi.mendelson@cs.technion.ac.il assaf@cs.technion.ac.il", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Amdahl law; Amdahl's law; Bridges; code parallelization; Computational modeling; dynamic frequency scaling mechanisms; energy consumption; Energy consumption; energy consumption; Haswell; multicore; multicore design; Multicore processing; multicore processors; multiple cores; multiprocessing systems; parallel programming; parallel programs; Performance modeling; Power demand; Program processors; Sandy Bridge Intel CPU; sequential code parallelization; sequential program rewriting; software development perspective; software engineering; Time measurement; turbo boost; Turbo Boost; turbo boost", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "0", unique-id = "Verner:2017:EAL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Sasaki:2017:HTP, author = "Hiroshi Sasaki and Fang-Hsiang Su and Teruo Tanimoto and Simha Sethumadhavan", title = "Heavy Tails in Program Structure", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "34--37", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2574350", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Designing and optimizing computer systems require deep understanding of the underlying system behavior. Historically many important observations that led to the development of essential hardware and software optimizations were driven by empirical observations about program behavior. In this paper, we report an interesting property of program structures by viewing dynamic program execution as a changing network. 
By analyzing the communication network created as a result of dynamic program execution, we find that communication patterns follow heavy-tailed distributions. In other words, a few instructions have consumers that are orders of magnitude larger than most instructions in a program. Surprisingly, these heavy-tailed distributions follow the iconic power law previously seen in man-made and natural networks. We provide empirical measurements based on the SPEC CPU2006 benchmarks to validate our findings as well as perform semantic analysis of the source code to reveal the causes of such behavior.", acknowledgement = ack-nhfb, affiliation = "Sasaki, H (Reprint Author), Columbia Univ, Dept Comp Sci, New York, NY 10027 USA. Sasaki, Hiroshi; Su, Fang-Hsiang; Sethumadhavan, Simha, Columbia Univ, Dept Comp Sci, New York, NY 10027 USA. Tanimoto, Teruo, Kyushu Univ, Grad Sch Informat Sci \& Elect Engn, Fukuoka 8190395, Japan.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sasaki@cs.columbia.edu mikefhsu@cs.columbia.edu teruo.tanimoto@cpc.ait.kyushu-u.ac.jp simha@cs.columbia.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "JSPS Postdoctoral Fellowships for Research Abroad; US National Science Foundation [1302269]; Alfred P. Sloan Fellowship", funding-text = "This work is sponsored in part by JSPS Postdoctoral Fellowships for Research Abroad, US National Science Foundation award number 1302269 and Alfred P. Sloan Fellowship. This work was done while Teruo Tanimoto was a visiting student at Columbia University.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Communication networks; computer systems; Computers; dynamic program execution; empirical studies; hardware optimization; heavy-tailed distribution; Image edge detection; Optimization; Program characterization; program diagnostics; program structure; Registers; semantic analysis; Shape; software optimization; SPEC CPU2006 benchmarks; statistical distribution; statistical distributions; system behavior", number-of-cited-references = "9", oa = "Bronze", research-areas = "Computer Science", researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019", times-cited = "1", unique-id = "Sasaki:2017:HTP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Feng:2017:HHC, author = "Liang Feng and Hao Liang and Sharad Sinha and Wei Zhang", title = "{HeteroSim}: a Heterogeneous {CPU--FPGA} Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "38--41", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2615617", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Heterogeneous Computing is a promising direction to address the challenges of performance and power walls in high-performance computing, where CPU-FPGA architectures are particularly promising for application acceleration. However, the development of such architectures associated with optimal memory hierarchies is challenging due to the absence of an integrated simulator to support full system simulation and architectural exploration. In this work, we present HeteroSim, a full system simulator supporting x86 multi-cores integrated with an FPGA via bus connection. 
It can support fast architectural exploration with respect to number of cores, number of accelerated kernels on FPGA, and different memory hierarchies between CPU and FPGA. Various performance metrics are returned for further performance analysis and architectural configuration optimization.", acknowledgement = ack-nhfb, affiliation = "Feng, L (Reprint Author), Hong Kong Univ Sci \& Technol, Kowloon, Hong Kong, Peoples R China. Feng, Liang; Liang, Hao; Sinha, Sharad; Zhang, Wei, Hong Kong Univ Sci \& Technol, Kowloon, Hong Kong, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "lfengad@connect.ust.hk hliangac@connect.ust.hk sharad\_sinha@ieee.org wei.zhang@ust.hk", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; architectural configuration optimization; bus connection; Computational modeling; Computer architecture; CPU-FPGA architectures; digital simulation; Field programmable gate arrays; field programmable gate arrays; FPGA; full system simulator; Hardware design languages; heterogeneous computing; heterogeneous CPU-FPGA simulator; heterogeneous system; HeteroSim; high-performance computing; Kernel; microprocessor chips; multiprocessing systems; optimal memory hierarchies; parallel architectures; performance analysis; performance metrics; Registers; Simulator; x86 multicores", number-of-cited-references = "11", ORCID-numbers = "SINHA, SHARAD/0000-0002-4532-2017", research-areas = "Computer Science", researcherid-numbers = "SINHA, SHARAD/J-6775-2019 SINHA, SHARAD/R-2575-2017", times-cited = "1", unique-id = "Feng:2017:HHC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhao:2017:LIC, author = "Xia Zhao and Yuxi Liu and Almutaz Adileh and Lieven 
Eeckhout", title = "{LA-LLC}: Inter-Core Locality-Aware Last-Level Cache to Exploit Many-to-Many Traffic in {GPGPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "42--45", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2611663", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The reply network is a severe performance bottleneck in General Purpose Graphic Processing Units (GPGPUs), as the communication path from memory controllers (MC) to cores is often congested. In this paper, we find that instead of relying on the congested communication path between MCs and cores, the unused core-to-core communication path can be leveraged to transfer data blocks between cores. We propose the inter-core Locality-Aware Last-Level Cache (LA-LLC), which requires only few bits per cache block and enables a core to fetch shared data from another core's private cache instead of the LLC. Leveraging inter-core communication, LA-LLC transforms few-to-many traffic to many-to-many traffic, thereby mitigating the reply network bottleneck. For a set of applications exhibiting varying degrees of inter-core locality, LA-LLC reduces memory access latency and increases performance by 21.1 percent on average and up to 68 percent, with negligible hardware cost.", acknowledgement = ack-nhfb, affiliation = "Zhao, X (Reprint Author), Univ Ghent, Ghent, Belgium. Zhao, Xia; Liu, Yuxi; Adileh, Almutaz; Eeckhout, Lieven, Univ Ghent, Ghent, Belgium.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "xia.zhao@ugent.be yuxi.liu@ugent.be almutaz.adileh@ugent.be lieven.eeckhout@ugent.be", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Benchmark testing; cache storage; congested communication path; core-to-core communication path; few-to-many traffic; general purpose graphic processing units; GPGPU; GPGPUs; Graphics processing units; graphics processing units; inter-core locality; intercore communication; intercore locality-aware last-level cache; LA-LLC; LLC; many-to-many traffic; memory access latency; memory controllers; Multiprocessor interconnection; network-on-chip; NoC; Ports (Computers); private cache; reply network; shared data fetching; System recovery", number-of-cited-references = "16", oa = "Green Published", ORCID-numbers = "Zhao, Xia/0000-0001-6479-9200", research-areas = "Computer Science", times-cited = "0", unique-id = "Zhao:2017:LIC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Boroumand:2017:LEC, author = "Amirali Boroumand and Saugata Ghose and Minesh Patel and Hasan Hassan and Brandon Lucia and Kevin Hsieh and Krishna T. Malladi and Hongzhong Zheng and Onur Mutlu", title = "{LazyPIM}: an Efficient Cache Coherence Mechanism for Processing-in-Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "46--50", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2577557", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Processing-in-memory (PIM) architectures cannot use traditional approaches to cache coherence due to the high off-chip traffic consumed by coherence messages. We propose LazyPIM, a new hardware cache coherence mechanism designed specifically for PIM. 
LazyPIM uses a combination of speculative cache coherence and compressed coherence signatures to greatly reduce the overhead of keeping PIM coherent with the processor. We find that LazyPIM improves average performance across a range of PIM applications by 49.1 percent over the best prior approach, coming within 5.5 percent of an ideal PIM mechanism.", acknowledgement = ack-nhfb, affiliation = "Boroumand, A (Reprint Author), Carnegie Mellon Univ, Pittsburgh, PA 15123 USA. Boroumand, Amirali; Ghose, Saugata; Patel, Minesh; Hassan, Hasan; Lucia, Brandon; Hsieh, Kevin; Mutlu, Onur, Carnegie Mellon Univ, Pittsburgh, PA 15123 USA. Hassan, Hasan, TOBB ETU Sogutozu, TR-06560 Ankara, Turkey. Malladi, Krishna T.; Zheng, Hongzhong, Samsung Semicond Inc, Milpitas, CA 95035 USA. Mutlu, Onur, ETH, Ramistr, CH-8092 Zurich, Switzerland.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "amirali@cmu.edu ghose@cmu.edu mineshp@andrew.cmu.edu hhasan@etu.edu.tr blucia@andrew.cmu.edu tsuwangh@andrew.cmu.edu k.tej@ssi.samsung.com hz.zheng@ssi.samsung.com omuthu@gmail.com", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; cache coherence mechanism; cache storage; Coherence; coherence messages; compressed coherence; Computer architecture; Kernel; LazyPIM mechanism; Message systems; PIM architecture; processing-in-memory; Programming; Random access memory; speculative cache coherence", keywords-plus = "CONSISTENCY", number-of-cited-references = "30", oa = "Bronze", research-areas = "Computer Science", times-cited = "8", unique-id = "Boroumand:2017:LEC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gottscho:2017:MIM, author = "Mark Gottscho and Mohammed Shoaib and Sriram Govindan and Bikash Sharma and Di Wang and Puneet Gupta", title = "Measuring the Impact of Memory Errors on Application Performance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "51--55", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2599513", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory reliability is a key factor in the design of warehouse-scale computers. Prior work has focused on the performance overheads of memory fault-tolerance schemes when errors do not occur at all, and when detected but uncorrectable errors occur, which result in machine downtime and loss of availability. We focus on a common third scenario, namely, situations when hard but correctable faults exist in memory; these may cause an ``avalanche'' of errors to occur on affected hardware. We expose how the hardware/software mechanisms for managing and reporting memory errors can cause severe performance degradation in systems suffering from hardware faults. 
We inject faults in DRAM on a real cloud server and quantify the single-machine performance degradation for both batch and interactive workloads. We observe that for SPEC CPU2006 benchmarks, memory errors can slow down average execution time by up to 2.5x. For an interactive web-search workload, average query latency degrades by up to 2.3x for a light traffic load, and up to an extreme 3746x under peak load. Our analyses of the memory error-reporting stack reveals architecture, firmware, and software opportunities to improve performance consistency by mitigating the worst-case behavior on faulty hardware.", acknowledgement = ack-nhfb, affiliation = "Gottscho, M (Reprint Author), Univ Calif Los Angeles, Dept Elect Engn, Los Angeles, CA 90095 USA. Gottscho, Mark; Gupta, Puneet, Univ Calif Los Angeles, Dept Elect Engn, Los Angeles, CA 90095 USA. Shoaib, Mohammed; Wang, Di, Microsoft Res, Redmond, WA 98052 USA. Govindan, Sriram; Sharma, Bikash, Microsoft, Redmond, WA 98052 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mgottscho@ucla.edu shoaib@microsoft.com srgovin@microsoft.com bsharma@microsoft.com wangdi@microsoft.com puneet@ee.ucla.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF Variability Expedition Grant [CCF-1029030]", funding-text = "This work was conducted jointly between Microsoft Corporation and the NanoCAD Lab of the Electrical Engineering Department at the University of California, Los Angeles (UCLA). The authors thank Dr. Jie Liu of Microsoft Research, and Dr. Badriddine Khessib and Dr. Kushagra Vaid of Microsoft for supporting this work while Mr. Gottscho was an intern at Microsoft Research in 2015. Funding came partly from the NSF Variability Expedition Grant No. CCF-1029030.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application performance; availability; cloud; Degradation; DRAM; dynamic random-access storage; error-handling; fault tolerant computing; Hardware; hardware/software interface; hardware/software mechanisms; Instruction sets; interactive web-search workload; Main memory; memory errors; memory fault-tolerance schemes; memory reliability; performance consistency; Random access memory; random-access storage; RAS; reliability; Reliability; servers; Servers; servers; warehouse-scale computer design", keywords-plus = "VARIABILITY; RELIABILITY; SYSTEMS", number-of-cited-references = "32", oa = "Bronze", research-areas = "Computer Science", times-cited = "2", unique-id = "Gottscho:2017:MIM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Adileh:2017:MPH, author = "Almutaz Adileh and Stijn Eyerman and Aamer Jaleel and Lieven Eeckhout", title = "Mind The Power Holes: Sifting Operating Points in Power-Limited Heterogeneous Multicores", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "56--59", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2616339", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Heterogeneous chip multicore processors (HCMPs) equipped with multiple voltage-frequency (V-F) operating points provide a wide spectrum of power-performance tradeoff opportunities. This work targets the performance of HCMPs under a power cap. We show that for any performance optimization technique to work under power constraints, the default set of V-F operating points in HCMPs must be first filtered based on the application's power and performance characteristics. 
Attempting to find operating points of maximum performance by naively walking the default set of operating points leads the application to inefficient operating points which drain power without significant performance benefit. We call these points Power Holes (PH). Contrary to intuition, we show that even using a power-performance curve of Pareto-optimal operating points still degrades performance significantly for the same reason. We propose PH-Sifter, a fast and scalable technique that sifts the default set of operating points and eliminates power holes. We show significant performance improvement of PH-Sifter compared to Pareto sifting for three use cases: (i) maximizing performance for a single application, (ii) maximizing system throughput for multi-programmed workloads, and (iii) maximizing performance of a system in which a fraction of the power budget is reserved for a high-priority application. Our results show performance improvements of 13, 27, and 28 percent on average that reach up to 52, 91 percent, and 2.3x, respectively, for the three use cases.", acknowledgement = ack-nhfb, affiliation = "Adileh, A (Reprint Author), Univ Ghent, B-9052 Ghent, East Flanders, Belgium. Adileh, Almutaz; Eeckhout, Lieven, Univ Ghent, B-9052 Ghent, East Flanders, Belgium. Eyerman, Stijn, Intel Belgium, B-2550 Leuven, Kontich, Belgium. Jaleel, Aamer, Nvidia Res, Boston, MA 01886 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "almutaz.adileh@ugent.be stijn.eyerman@elis.ugent.be ajaleel@nvidia.com lieven.eeckhout@elis.ugent.be", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "European Research Council under the European Community's Seventh Framework Programme (FP7)/ERC grant [259295]", funding-text = "We thank the anonymous reviewers for their thoughtful feedback. 
This research is supported in part through the European Research Council under the European Community's Seventh Framework Programme (FP7/2007-2013)/ERC grant agreement no. 259295. This work was done while Stijn Eyerman was at Ghent University.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "HCMP performance; heterogeneous chip multicore processors; Heterogeneous multicores; high-priority application; Indexes; Legged locomotion; Multicore processing; multiple voltage-frequency operating points; multiprocessing systems; multiprogramming; optimal operating points; Optimization; Pareto optimisation; Pareto-optimal operating points; performance evaluation; performance maximization; performance optimization; PH-Sifter; power aware computing; Power Holes; power management; power-limited processors; power-performance curve; power-performance tradeoff opportunities; Program processors; Schedules; system throughput maximization; Throughput; V-F operating points", keywords-plus = "PERFORMANCE; DVFS", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "1", unique-id = "Adileh:2017:MPH", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Sasaki:2017:MPC, author = "Hiroshi Sasaki and Alper Buyuktosunoglu and Augusto Vega and Pradip Bose", title = "Mitigating Power Contention: a Scheduling Based Approach", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "60--63", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2572080", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Shared resource contention has been a major performance issue for CMPs. 
In this paper, we tackle the power contention problem in power constrained CMPs by considering and treating power as a first-class shared resource. Power contention occurs when multiple processes compete for power, and leads to degraded system performance. In order to solve this problem, we develop a shared resource contention-aware scheduling algorithm that mitigates the contention for power and the shared memory subsystem at the same time. The proposed scheduler improves system performance by balancing the shared resource usage among scheduling groups. Evaluation results across a variety of multiprogrammed workloads show performance improvements over a state-of-the-art scheduling policy which only considers memory subsystem contention.", acknowledgement = ack-nhfb, affiliation = "Sasaki, H (Reprint Author), Columbia Univ, Dept Comp Sci, New York, NY 10027 USA. Sasaki, Hiroshi, Columbia Univ, Dept Comp Sci, New York, NY 10027 USA. Buyuktosunoglu, Alper; Vega, Augusto; Bose, Pradip, IBM TJ Watson Res Ctr, New York, NY 10598 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sasaki@cs.columbia.edu alperb@us.ibm.com ajvega@us.ibm.com pbose@us.ibm.com", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "JSPS Postdoctoral Fellowships for Research Abroad; Defense Advanced Research Projects Agency (DARPA), Microsystems Technology Office (MTO) [HR0011-13-C-0022]", funding-text = "This work is sponsored, in part, by JSPS Postdoctoral Fellowships for Research Abroad, and Defense Advanced Research Projects Agency (DARPA), Microsystems Technology Office (MTO), under contract number HR0011-13-C-0022. The views expressed are those of the authors and do not reflect the official policy or position of the Department of Defense or the U.S. Government.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; chip multiprocessors; energy-efficient systems; first-class shared resource; Memory management; memory subsystem contention; multi-core processors; multiprogrammed workloads; performance evaluation; power aware computing; power capping; power constrained CMP; Power contention; power contention problem; Power demand; process scheduling; processor scheduling; Processor scheduling; Random access memory; resource allocation; Scheduling; scheduling-based approach; shared memory systems; shared resource contention-aware scheduling algorithm; System performance", keywords-plus = "PERFORMANCE", number-of-cited-references = "15", oa = "Bronze", research-areas = "Computer Science", researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019", times-cited = "1", unique-id = "Sasaki:2017:MPC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Marquez:2017:MCH, author = "David Gonzalez Marquez and Adrian Cristal Kestelman and Esteban Mocskos", title = "{Mth}: Codesigned Hardware\slash Software Support for Fine Grain Threads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "64--67", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2606383", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multi-core processors are ubiquitous in all market segments from embedded to high performance computing, but only few applications can efficiently utilize them. Existing parallel frameworks aim to support thread-level parallelism in applications, but the imposed overhead prevents their usage for small problem instances. 
This work presents Micro-threads (Mth) a hardware-software proposal focused on a shared thread management model enabling the use of parallel resources in applications that have small chunks of parallel code or small problem inputs by a combination of software and hardware: delegation of the resource control to the application, an improved mechanism to store and fill processor's context, and an efficient synchronization system. Four sample applications are used to test our proposal: HSL filter (trivially parallel), FFT Radix2 (recursive algorithm), LU decomposition (barrier every cycle) and Dantzig algorithm (graph based, matrix manipulation). The results encourage the use of Mth and could smooth the use of multiple cores for applications that currently can not take advantage of the proliferation of the available parallel resources in each chip.", acknowledgement = ack-nhfb, affiliation = "Marquez, DG (Reprint Author), Univ Buenos Aires, Fac Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA, RA-1053 Buenos Aires, DF, Argentina. Marquez, David Gonzalez; Mocskos, Esteban, Univ Buenos Aires, Fac Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA, RA-1053 Buenos Aires, DF, Argentina. Mocskos, Esteban, CSC CONICET, C1425FQD, RA-2390 Buenos Aires, DF, Argentina. Kestelman, Adrian Cristal, CSIC, IIIA, Barcelona Supercomp Ctr, ES-08034 Barcelona, Spain. Kestelman, Adrian Cristal, Univ Politecn Cataluna, Dept Comp Architecture, ES-08034 Barcelona, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "dmarquez@dc.uba.ar adrian.cristal@bsc.es emocskos@dc.uba.ar", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Universidad de Buenos Aires [UBACyT 20020130200096BA]; CONICET [PIP 11220110100379]", funding-text = "This work was partially funded by grants from Universidad de Buenos Aires (UBACyT 20020130200096BA) and CONICET (PIP 11220110100379). 
The authors thank specially Osman Unsal for reading this article with fruitful criticism.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "barrier every cycle; codesigned hardware-software support; Dantzig algorithm; digital arithmetic; embedded processors; fast Fourier transforms; FFT Radix2 algorithm; fine grain threads; graph based algorithm; graph theory; hardware-software codesign; high performance computing; HSL filter; LU decomposition; matrix decomposition; matrix manipulation; Message systems; microthreads; Mirrors; Mth hardware/software support; multi-threading; multicore processing; multicore processors; multithreading; Parallel architectures; parallel architectures; Parallel architectures; parallel code; parallel frameworks; Parallel processing; parallel programming; parallel resources; Program processors; Proposals; recursive algorithm; Registers; resource control; shared memory systems; shared thread management model; Synchronization; synchronization system; thread-level parallelism support; trivially parallel filter", keywords-plus = "PARALLELISM", number-of-cited-references = "11", ORCID-numbers = "Mocskos, Esteban/0000-0002-6473-7672", research-areas = "Computer Science", times-cited = "0", unique-id = "Marquez:2017:MCH", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Morad:2017:ORO, author = "Tomer Y. Morad and Gil Shomron and Mattan Erez and Avinoam Kolodny and Uri C. 
Weiser", title = "Optimizing Read-Once Data Flow in Big-Data Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "68--71", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2520927", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory hierarchies in modern computing systems work well for workloads that exhibit temporal data locality. Data that is accessed frequently is brought closer to the computing cores, allowing faster access times, higher bandwidth, and reduced transmission energy. Many applications that work on big data, however, read data only once. When running these applications on modern computing systems, data that is not reused is nevertheless transmitted and copied into all memory hierarchy levels, leading to energy and bandwidth waste. In this paper we evaluate workloads dealing with read-once data and measure their energy consumption. We then modify the workloads so that data that is known to be used only once is transferred directly from storage into the CPU's last level cache, effectively bypassing DRAM and avoiding keeping unnecessary copies of the data. Our measurements on a real system show savings of up to 5 Watts in server power and up to 3.9 percent reduction in server energy when 160 GB of read-once data bypasses DRAM.", acknowledgement = ack-nhfb, affiliation = "Morad, TY (Reprint Author), Cornell Tech, Jacobs Technion Cornell Inst, 111 8th Ave, New York, NY 10011 USA. Morad, Tomer Y.; Shomron, Gil; Kolodny, Avinoam; Weiser, Uri C., Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Morad, Tomer Y., Cornell Tech, Jacobs Technion Cornell Inst, 111 8th Ave, New York, NY 10011 USA. 
Erez, Mattan, Univ Texas Austin, Dept Elect \& Comp Engn, 201 E 24th St, C0803, POB 6-248, Austin, TX 78712 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tomerm@tx.technion.ac.il gilsho@tx.technion.ac.il mattan.erez@utexas.edu kolodny@ee.technion.ac.il uri.weiser@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Intel Collaborative Research Institute for Computational Intelligence (ICRI-CI)", funding-text = "This research was supported by the Intel Collaborative Research Institute for Computational Intelligence (ICRI-CI).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; bandwidth wastage; Big Data; Big-Data applications; cache storage; computing cores; CPU last-level cache; data access time; data flow computing; DRAM; energy consumption measure; Energy efficiency; Energy measurement; energy wastage; memory architecture; memory hierarchy levels; Memory management; Performance evaluation; Prefetching; Random access memory; read-once data flow optimization; reduced transmission energy; server energy reduction; Servers; temporal data locality", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "0", unique-id = "Morad:2017:ORO", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yasoubi:2017:PEA, author = "Ali Yasoubi and Reza Hojabr and Mehdi Modarressi", title = "Power-Efficient Accelerator Design for Neural Networks Using Computation Reuse", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "72--75", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2521654", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Applications of neural networks in various fields of research and technology have expanded widely in recent years. In particular, applications with inherent tolerance to accuracy loss, such as signal processing and multimedia applications, are highly suited to the approximation property of neural networks. This approximation property has been exploited in many existing neural network accelerators to trade-off accuracy for power-efficiency and speed. In addition to the power saving obtained by approximation, we observed that a considerable amount of arithmetic operations in neural networks are repetitive and can be eliminated to further decrease power consumption. Given this observation, we propose CORN, COmputation Reuse-aware Neural network accelerator that allows neurons to share their computation results, effectively eliminating the power usage of redundant computations. We will show that CORN lowers power consumption by 26 percent on average over low-power neural network accelerators.", acknowledgement = ack-nhfb, affiliation = "Yasoubi, A (Reprint Author), Univ Tehran, Dept Elect \& Comp Engn, Coll Engn, Tehran, Iran. Yasoubi, Ali; Hojabr, Reza; Modarressi, Mehdi, Univ Tehran, Dept Elect \& Comp Engn, Coll Engn, Tehran, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "a.yosoubi@ut.ac.ir r.hojabr@ut.ac.ir modarressi@ut.ac.ir", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "approximation; arithmetic operations; Biological neural networks; Buffer storage; computation reuse; computation reuse-aware neural network accelerator; Computer architecture; CORN; energy conservation; hardware accelerator; low-power neural network accelerators; neural nets; Neural network; Neurons; power aware computing; Power demand; power usage elimination; power-efficiency; power-efficient accelerator design; Redundancy; redundant computations", keywords-plus = "RECOGNITION", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "1", unique-id = "Yasoubi:2017:PEA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Son:2017:SAS, author = "Young Hoon Son and Hyunyoon Cho and Yuhwan Ro and Jae W. Lee and Jung Ho Ahn", title = "{SALAD}: Achieving Symmetric Access Latency with Asymmetric {DRAM} Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "76--79", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2525760", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory access latency has significant impact on application performance. Unfortunately, the random access latency of DRAM has been scaling relatively slowly, and often directly affects the critical path of execution, especially for applications with insufficient locality or memory-level parallelism. The existing low-latency DRAM organizations either incur significant area overhead or burden the software stack with non-uniform access latency. This paper proposes SALAD, a new DRAM device architecture that provides symmetric access latency with asymmetric DRAM bank organizations. 
Since local banks have lower data transfer time due to their proximity to the I/O pads, SALAD applies high aspect-ratio (i.e., low-latency) mats only to remote banks to offset the difference in data transfer time, thus providing uniformly low access time (tAC) over the whole device. Our evaluation demonstrates that SALAD improves the IPC by 13 percent (10 percent) without any software modifications, while incurring only 6 percent (3 percent) area overhead.", acknowledgement = ack-nhfb, affiliation = "Son, YH (Reprint Author), Seoul Natl Univ, Seoul, South Korea. Son, Young Hoon; Cho, Hyunyoon; Ro, Yuhwan; Ahn, Jung Ho, Seoul Natl Univ, Seoul, South Korea. Lee, Jae W., Sungkyunkwan Univ, Seoul, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yhson96@snu.ac.kr sumk40@snu.ac.kr yuhwanro@snu.ac.kr jaewlee@skku.edu gajh@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea - Korea government [NRF-2015M3C4A7065647]; ICT R\&D program of MSIP/IITP [KI001810041244]", funding-text = "This work was partially supported by the National Research Foundation of Korea grant funded by the Korea government (NRF-2015M3C4A7065647) and ICT R\&D program of MSIP/IITP (KI001810041244).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "asymmetric bank organizations; asymmetric DRAM bank organizations; Data transfer; data transfer time; DRAM; DRAM chips; DRAM device architecture; I/O pads; memory architecture; Memory management; microarchitecture; Organizations; Parallel processing; Random access memory; SALAD; Software; symmetric access latency with asymmetric DRAM; uniformly low access time", number-of-cited-references = "20", research-areas = "Computer Science", times-cited = "1", unique-id = "Son:2017:SAS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Judd:2017:SBS, author = "Patrick Judd and Jorge Albericio and Andreas Moshovos", title = "{Stripes}: Bit-Serial Deep Neural Network Computing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "80--83", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2597140", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The numerical representation precision required by the computations performed by Deep Neural Networks (DNNs) varies across networks and between layers of a same network. This observation motivates a precision-based approach to acceleration which takes into account both the computational structure and the required numerical precision representation. This work presents Stripes (STR), a hardware accelerator that uses bit-serial computations to improve energy efficiency and performance. Experimental measurements over a set of state-of-the-art DNNs for image classification show that STR improves performance over a state-of-the-art accelerator from 1.35x to 5.33x and by 2.24x on average. STR's area and power overhead are estimated at 5 percent and 12 percent respectively. 
STR is 2.00x more energy efficient than the baseline.", acknowledgement = ack-nhfb, affiliation = "Judd, P (Reprint Author), Univ Toronto, Edward S Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S 3H7, Canada. Judd, Patrick; Albericio, Jorge; Moshovos, Andreas, Univ Toronto, Edward S Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S 3H7, Canada.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "patrick.judd@mail.utoronto.ca jorge@ece.utoronto.ca moshovos@eecg.toronto.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Artificial neural networks; bit-serial computations; bit-serial deep neural network computing; convolution; deep learning; deep neural networks; energy efficiency; Graphics processing units; Hardware acceleration; image classification; learning (artificial intelligence); neural nets; Neurons; Nickel; numerical representation; Parallel processing; precision-based approach; serial computing; STR; Stripes; Three-dimensional displays", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "5", unique-id = "Judd:2017:SBS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Ravi:2017:TSM, author = "Gokul Subramanian Ravi and Mikko Lipasti", title = "Timing Speculation in Multi-Cycle Data Paths", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "84--87", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2580501", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Modern processors set timing margins conservatively at design time to support extreme 
variations in workload and environment, in order to operate reliably and produce expected outputs. Unfortunately, the conservative guard bands set to achieve this reliability are detrimental to processor performance and energy efficiency. In this paper, we propose the use of processors with internal transparent pipelines, which allow data to flow between stages without latching, to maximize timing speculation efficiency as they are inherently suited to slack conservation. We design a synchronous tracking mechanism which runs in parallel with the multi-cycle data path to estimate the accumulated slack across instructions/pipeline stages and then appropriately clock synchronous boundaries early to minimize wasted slack and achieve maximum clock cycle savings. Preliminary evaluations atop the CRIB processor show performance improvements of greater than 10\% on average and as high as 30\% for an assumed 25\% slack per clock cycle.", acknowledgement = ack-nhfb, affiliation = "Ravi, GS (Reprint Author), Univ Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA. Ravi, Gokul Subramanian; Lipasti, Mikko, Univ Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "gravi@wisc.edu mikko@engr.wisc.edu", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "clock cycle savings; clocks; Clocks; CRIB; CRIB processor; internal transparent pipelines; microprocessor chips; multi-cycle datapath; multicycle data paths; parallel processing; parallel synchronous tracking mechanism; pipeline processing; Pipelines; Program processors; Proposals; Registers; Reliability; slack; Timing; Timing speculation; timing speculation", number-of-cited-references = "8", oa = "Bronze", research-areas = "Computer Science", times-cited = "0", unique-id = "Ravi:2017:TSM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Khan:2017:CMC, author = "Samira Khan and Chris Wilkerson and Donghyuk Lee and Alaa R. Alameldeen and Onur Mutlu", title = "A Case for Memory Content-Based Detection and Mitigation of Data-Dependent Failures in {DRAM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "88--93", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2624298", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "DRAM cells in close proximity can fail depending on the data content in neighboring cells. These failures are called data-dependent failures. Detecting and mitigating these failures online while the system is running in the field enables optimizations that improve reliability, latency, and energy efficiency of the system. All these optimizations depend on accurately detecting every possible data-dependent failure that could occur with any content in DRAM. Unfortunately, detecting all data-dependent failures requires the knowledge of DRAM internals specific to each DRAM chip. 
As internal DRAM architecture is not exposed to the system, detecting data-dependent failures at the system-level is a major challenge. Our goal in this work is to decouple the detection and mitigation of data-dependent failures from physical DRAM organization such that it is possible to detect failures without knowledge of DRAM internals. To this end, we propose MEMCON, a memory content-based detection and mitigation mechanism for data-dependent failures in DRAM. MEMCON does not detect every possible data-dependent failure. Instead, it detects and mitigates failures that occur with the current content in memory while the programs are running in the system. Using experimental data from real machines, we demonstrate that MEMCON is an effective and low-overhead system-level detection and mitigation technique for data-dependent failures in DRAM.", acknowledgement = ack-nhfb, affiliation = "Khan, S (Reprint Author), Univ Virginia, Charlottesville, VA 22903 USA. Khan, Samira, Univ Virginia, Charlottesville, VA 22903 USA. Wilkerson, Chris; Alameldeen, Alaa R., Intel Labs, Santa Clara, CA 95054 USA. Lee, Donghyuk; Mutlu, Onur, Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Mutlu, Onur, ETH, CH-8092 Zurich, Switzerland.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "samirakhan@virginia.edu chris.wilkerson@intel.com donghyu1@cmu.edu alaa.r.alameldeen@intel.com onur@cmu.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "ISTC-CC, an US National Science Foundation [CCF-0953246]; US National Science Foundation [CCF-1212962, CNS-1320531, CCF-1566483]", funding-text = "We thank anonymous reviewers and SAFARI group members for feedback. We acknowledge the support of Google, Intel, Nvidia, Seagate, and Samsung. 
This research was supported in part by the ISTC-CC, an US National Science Foundation CAREER Award (CCF-0953246), and US National Science Foundation grants (CCF-1212962, CNS-1320531, and CCF-1566483).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Content management; data content; data dependent failure; data-dependent failures; DRAM; DRAM cells; DRAM chips; DRAM internals; DRAM, data dependent failure, system-level testing; failure analysis; Failure analysis; integrated circuit reliability; Interference; low-overhead system-level detection technique; low-overhead system-level mitigation technique; MEMCON; memory content-based detection; memory content-based mitigation; neighboring cells; optimisation; physical DRAM organization; System-level design; system-level testing; Testing", keywords-plus = "NOISE", number-of-cited-references = "42", research-areas = "Computer Science", times-cited = "0", unique-id = "Khan:2017:CMC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Mittal:2017:ARD, author = "Sparsh Mittal and Jeffrey S. Vetter and Lei Jiang", title = "Addressing Read-Disturbance Issue in {STT--RAM} by Data Compression and Selective Duplication", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "94--98", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2645207", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In deep sub-micron region, spin transfer torque RAM (STT-RAM) shows read-disturbance error (RDE) which presents a crucial reliability challenge. We present SHIELD, a technique to mitigate RDE in STT-RAM last level caches (LLCs).
SHIELD uses data compression to reduce cache-write traffic and restore requirement. Also, SHIELD keeps two copies of data blocks compressed to less than half the block size and since several LLC blocks are only accessed once, this approach avoids several restore operations. SHIELD consumes smaller energy than two previous RDE-mitigation techniques, namely high-current restore required read (HCRR, also called restore-after-read) and low-current long latency read (LCLL) and even an ideal RDE-free STT-RAM cache.", acknowledgement = ack-nhfb, affiliation = "Mittal, S (Reprint Author), IIT Hyderabad, Sangareddy 502285, Telangana, India. Mittal, Sparsh, IIT Hyderabad, Sangareddy 502285, Telangana, India. Vetter, Jeffrey S., Oak Ridge Natl Lab, Oak Ridge, TN 37830 USA. Jiang, Lei, Indiana Univ, Bloomington, IN 47405 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sparsh0mittal@gmail.com vetter@ornl.gov jiang60@iu.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "U.S. Department of Energy, Office of Science, Advanced Scientific Computing Research", funding-text = "Support for this work was provided by the U.S. Department of Energy, Office of Science, Advanced Scientific Computing Research.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; data blocks; data compression; Data compression; data compression; deep sub-micron region; duplication; Encoding; Error analysis; Error correction codes; HCRR; ideal RDE-free STT-RAM cache; integrated circuit reliability; last level cache; last level caches; LCLL; LLC; low-current long latency read; Magnetic tunneling; Non-volatile memory; Nonvolatile memory; Random access memory; random-access storage; read disturbance error; read-disturbance error; restore-after-read; selective duplication; SHIELD; spin transfer torque RAM; STT-RAM; transfer torque RAM", number-of-cited-references = "14", ORCID-numbers = "Mittal, Sparsh/0000-0002-2908-993X", research-areas = "Computer Science", times-cited = "1", unique-id = "Mittal:2017:ARD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Bakhshalipour:2017:ETD, author = "Mohammad Bakhshalipour and Pejman Lotfi-Kamran and Hamid Sarbazi-Azad", title = "An Efficient Temporal Data Prefetcher for {L1} Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "99--102", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2654347", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Server workloads frequently encounter L1-D cache misses, and hence, lose significant performance potential. One way to reduce the number of L1-D misses or their effect is data prefetching. As L1-D access sequences have high temporal correlations, temporal prefetching techniques are promising for L1 caches. 
State-of-the-art temporal prefetching techniques are effective at reducing the number of L1-D misses, but we observe that there is a significant gap between what they offer and the opportunity. This work aims to improve the effectiveness of temporal prefetching techniques. To overcome the deficiencies of existing temporal prefetchers, we introduce Domino prefetching. Domino prefetcher is a temporal prefetching technique that looks up the history to find the last occurrence of the last one or two L1-D miss addresses for prefetching. We show that Domino prefetcher captures more than 87 percent of the temporal opportunity at L1-D. Through evaluation of a 16-core processor on a set of server workloads, we show that Domino prefetcher improves system performance by 26 percent (up to 56 percent).", acknowledgement = ack-nhfb, affiliation = "Bakhshalipour, M (Reprint Author), Sharif Univ Technol, Dept Comp Engn, Tehran 1458889694, Iran. Bakhshalipour, Mohammad; Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp Engn, Tehran 1458889694, Iran. Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch Comp Sci, Tehran 1956836681, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "bakhshalipour@ce.sharif.edu plotfi@ipm.ir azad@ipm.ir", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; Correlation; data prefetching; Domino prefetcher captures; efficient temporal data prefetcher; high temporal correlations; L1-D access sequences; L1-D cache misses; L1-D miss addresses; L1-D misses; multiprocessing systems; Prefetching; Server workloads; Servers; storage management; Streaming media; temporal correlation; temporal opportunity; temporal prefetching technique; Web search", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "2", unique-id = "Bakhshalipour:2017:ETD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Martinez:2017:SII, author = "Jorge A. Mart{\'\i}nez and Juan Antonio Maestro and Pedro Reviriego", title = "A Scheme to Improve the Intrinsic Error Detection of the Instruction Set Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "103--106", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2623628", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The Instruction Set Architecture (ISA) determines the effect that a soft error on an instruction can have on the processor. Previous works have shown that the ISA has some intrinsic capability of detecting errors. For example, errors that change a valid instruction into an invalid instruction encoding or into an instruction that causes an exception. The percentage of detectable errors varies widely for each bit in the ISA. For example, errors on bits that are used for immediate or register values are unlikely to be detected while those that are used for the opcode are more likely to lead to an exception. 
In this paper, this is exploited by introducing a simple encoding of the instructions that does not require additional bits. The idea is that the decoding propagates the error so that it affects the most sensitive bit of the ISA and therefore it is more likely to be detected. As no additional bits are required, no changes or overheads are needed in the memory. The proposed scheme is useful when the memory is not protected with parity or Error Correction Codes. The only cost of implementing the technique are simple encoder and decoder circuits that are similar to a parity computation. This technique is applicable to any ISA, no matter the length of the opcodes or their location in the instruction encoding. The effectiveness of the proposed scheme has been evaluated on the ARM Cortex M0 ISA resulting in an increase in the error detection capability of up to 1.64x.", acknowledgement = ack-nhfb, affiliation = "Martinez, JA (Reprint Author), Univ Antonio Nebrija, C Pirineos 55, Madrid 28040, Spain. Martinez, Jorge A.; Antonio Maestro, Juan; Reviriego, Pedro, Univ Antonio Nebrija, C Pirineos 55, Madrid 28040, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jmartine@nebrija.es jmaestro@nebrija.es previrie@nebrija.es", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ARM Cortex M0 ISA; Circuit faults; Computer architecture; decoder circuits; detectable errors; detecting errors; Encoding; Error analysis; error correction codes; Error Correction Codes; error detection; error detection capability; instruction set architecture; Instruction sets; instruction sets; intrinsic capability; intrinsic error detection; invalid instruction encoding; microprocessor chips; simple encoder; simple encoding; Soft error; soft error; Soft error", number-of-cited-references = "10", ORCID-numbers = "Maestro, Juan Antonio/0000-0001-7133-9026", research-areas = "Computer Science", researcherid-numbers = "Maestro, Juan Antonio/L-6091-2014", times-cited = "3", unique-id = "Martinez:2017:SII", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2017:DAS, author = "Rujia Wang and Sparsh Mittal and Youtao Zhang and Jun Yang", title = "{Decongest}: Accelerating Super-Dense {PCM} Under Write Disturbance by Hot Page Remapping", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "107--110", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2675883", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "At small feature sizes, phase change memory (PCM) shows write disturbance (WD) error (WDE) and this issue can eclipse the density and energy efficiency advantage of PCM. We propose `Decongest', a technique to address WD errors in main memory designed with super-dense (4F(2) cell size) PCM. Decongest works by identifying and remapping write-intensive hot pages to a WD-free spare area, which avoids WD to nearby pages due to writing these hot pages, and WD to these hot pages from writing nearby pages. 
Compared to a WD-affected super-dense PCM baseline, Decongest improves the performance by 14.0 percent, and saves 21.8 percent energy.", acknowledgement = ack-nhfb, affiliation = "Wang, RJ (Reprint Author), Univ Pittsburgh, Pittsburgh, PA 15260 USA. Wang, Rujia; Zhang, Youtao; Yang, Jun, Univ Pittsburgh, Pittsburgh, PA 15260 USA. Mittal, Sparsh, IIT Hyderabad, Kandi 502285, Telangana, India.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "rujia.w@pitt.edu sparsh0mittal@gmail.com youtao@pitt.edu juy9@pitt.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US NSF CCF [1617071]; IIT, Hyderabad, India", funding-text = "This work is partially supported by US NSF CCF\#1617071 and a seed-grant from IIT, Hyderabad, India.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Diseases; Energy management; energy saving; main memory; Microprocessors; page remapping; Phase change materials; Phase change memory; Radiation detectors; reliability; write disturbance", number-of-cited-references = "13", ORCID-numbers = "Mittal, Sparsh/0000-0002-2908-993X", research-areas = "Computer Science", times-cited = "0", unique-id = "Wang:2017:DAS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tanimoto:2017:EDG, author = "Teruo Tanimoto and Takatsugu Ono and Koji Inoue and Hiroshi Sasaki", title = "Enhanced Dependence Graph Model for Critical Path Analysis on Modern Out-of-Order Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "111--114", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2684813", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The dependence graph model of out-of-order (OoO) instruction execution is a powerful representation used for the critical path analysis. However most, if not all, of the previous models are out-of-date and lack enough detail to model modern OoO processors, or are too specific and complicated which limit their generality and applicability. In this paper, we propose an enhanced dependence graph model which remains simple but greatly improves the accuracy over prior models. The evaluation results using the gem5 simulator show that the proposed enhanced model achieves CPI error of 2.1 percent which is a 90.3 percent improvement against the state-of-the-art model.", acknowledgement = ack-nhfb, affiliation = "Tanimoto, T (Reprint Author), Kyushu Univ, Fukuoka 8190395, Japan. Tanimoto, Teruo; Ono, Takatsugu; Inoue, Koji, Kyushu Univ, Fukuoka 8190395, Japan. Sasaki, Hiroshi, Columbia Univ, New York, NY 10027 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "teruo.tanimoto@cpc.ait.kyushu-u.ac.jp takatsugu.ono@cpc.ait.kyushu-u.ac.jp inoue@ait.kyushu-u.ac.jp sasaki@cs.columbia.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "CREST, JST", funding-text = "This work was supported in part by CREST, JST. We would like to express our thanks to RIIT of Kyushu University for providing us the resource to conduct the experiments in this paper.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; Benchmark testing; computer architecture; critical path analysis; Delays; Dependence graph model; enhanced dependence graph model; graph theory; Hidden Markov models; Microarchitecture; modern OoO processors; out-of-order instruction execution; out-of-order processors; parallel architectures; Path planning; pipeline processing; Program processors", number-of-cited-references = "14", research-areas = "Computer Science", researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019", times-cited = "0", unique-id = "Tanimoto:2017:EDG", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lee:2017:FFE, author = "Junghee Lee and Kalidas Ganesh and Hyuk-Jun Lee and Youngjae Kim", title = "{FESSD}: a Fast Encrypted {SSD} Employing On-Chip Access-Control Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "115--118", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2667639", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cryptography is one of the most popular methods for protecting data stored in storage devices such as solid-state drives (SSDs). To maintain integrity of data, one of the popular techniques is that all incoming data are encrypted before they are stored, however, in this technique, the encryption overhead is non-negligible and it can increase I/O service time. In order to mitigate the negative performance impact caused by the data encryption, a write buffer can be used to hide the long latency by encryption. Using the write buffer, incoming unencrypted data can be immediately returned as soon as they are written in the buffer. 
They will get encrypted and synchronized with flash memory. However, if the write buffer itself is not encrypted, unencrypted secret data might leak through this insecure write buffer. On the other hand, if the entire write buffer is fully encrypted, it incurs significant performance overhead. To address this problem, we propose an on-chip access control memory (ACM) and presents a fast encrypted SSD, called FESSD that implements a secure write buffering mechanism using the ACM. The ACM does not require a memory-level full encryption mechanism, thus not only solving the unencrypted data leaking problem, but also offering relatively fast I/O service. Our simulation results show that the I/O response time of FESSD can be improved by up to 56 percent over a baseline where encrypted data are stored in the normal write buffer.", acknowledgement = ack-nhfb, affiliation = "Lee, J (Reprint Author), Univ Texas San Antonio, San Antonio, TX 78249 USA. Lee, Junghee; Ganesh, Kalidas, Univ Texas San Antonio, San Antonio, TX 78249 USA. Lee, Hyuk-Jun; Kim, Youngjae, Sogang Univ, Seoul 121742, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "junghee.lee@my.utsa.edu dyk567@my.utsa.edu hyukjunl@sogang.ac.kr youkim@sogang.ac.kr", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea (NRF) --- Korea Government (MISP) [2015R1C1A1A0152105]", funding-text = "This work was supported by the National Research Foundation of Korea (NRF) grant funded by the Korea Government (MISP) (No. 2015R1C1A1A0152105). This research also used resources of The University of Texas at San Antonio, San Antonio, TX. Youngjae Kim is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ACM; authorisation; cryptography; data encryption; encrypted data; encryption; Encryption; encryption; encryption overhead; fast encrypted SSD; FeSSD; flash memories; flash memory; Hardware; negative performance impact; Nonvolatile memory; normal write buffer; on-chip access control memory; on-chip access-control memory; on-chip memory; Registers; security; Solid-state drive (SSD); solid-state drives; storage devices; storage management; System-on-chip; unencrypted data leaking problem; unencrypted secret data", keywords-plus = "SECURITY", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Lee:2017:FFE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Badawy:2017:GLO, author = "Abdel-Hameed A. Badawy and Donald Yeung", title = "Guiding Locality Optimizations for Graph Computations via Reuse Distance Analysis", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "119--122", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2695178", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This work addresses the problem of optimizing graph-based programs for multicore processors. We use three graph benchmarks and three input data sets to characterize the importance of properly partitioning graphs among cores at multiple levels of the cache hierarchy. We also exhaustively explore a large design space comprised of different parallelization schemes and graph partitionings via detailed simulation to show how much gain we can obtain over a baseline legacy scheme that partitions for the L1 cache only. 
Our results demonstrate the legacy approach is not the best choice, and that our proposed parallelization / locality techniques can perform better (by up to 20 percent). We then use a performance prediction model based on multicore reuse distance (RD) profiles to rank order the different parallelization / locality schemes in the design space. We compare the best configuration as predicted by our model against the actual best identified by our exhaustive simulations. For one benchmark and data input, we show our model can achieve 79.5 percent of the performance gain achieved by the actual best. Across all benchmarks and data inputs, our model achieves 48 percent of the maximum performance gain. Our work demonstrates a new use case for multicore RD profiles --- i.e., as a tool for helping program developers and compilers to optimize graph-based programs.", acknowledgement = ack-nhfb, affiliation = "Badawy, AHA (Reprint Author), New Mexico State Univ, Klipsch Sch Elect \& Comp Engn, Las Cruces, NM 88003 USA. Badawy, Abdel-Hameed A., New Mexico State Univ, Klipsch Sch Elect \& Comp Engn, Las Cruces, NM 88003 USA. Yeung, Donald, Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD 20742 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "badawy@nmsu.edu yeung@umd.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "baseline legacy scheme; Benchmark testing; cache hierarchy; cache storage; Computational modeling; graph benchmarks; graph computations; graph partitionings; graph theory; legacy approach; locality optimization; memory system; Multicore processing; multicore processors; multicore RD profiles; multicore reuse distance profiles; multiprocessing systems; Optimization; partitioning; performance prediction model; prediction; Predictive models; profiling; program developers; Program processors; reuse distance; reuse distance analysis; Runtime", keywords-plus = "BIOMOLECULAR SIMULATION", number-of-cited-references = "11", ORCID-numbers = "Badawy, Abdel-Hameed/0000-0001-8027-1449", research-areas = "Computer Science", times-cited = "0", unique-id = "Badawy:2017:GLO", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zha:2017:IFM, author = "Yue Zha and Jing Li", title = "{IMEC}: a Fully Morphable In-Memory Computing Fabric Enabled by Resistive Crossbar", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "123--126", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2672558", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", abstract = "In this paper, we propose a fully morphable In-MEmory Computing (IMEC) fabric to better implement the concept of processing inside memory (PIM). Enabled by emerging nonvolatile memory, i.e., RRAM and its monolithic 3D integration, IMEC can be configured into one or a combination of four distinct functions, (1) logic, (2) ternary content addressable memory, (3) memory, and (4) interconnect. 
Thus, IMEC exploits a continuum of PIM capabilities across the whole spectrum, ranging from 0 percent (pure data storage) to 100 percent (pure compute engine), or intermediate states in between. IMEC can be modularly integrated into the DDRx memory subsystem, communicating with processors by the ordinary DRAM commands. Additionally, to reduce the programming burden, we provide a complete framework to compile applications written in high-level programming language (e.g., OpenCL) onto IMEC. This framework also enables code portability across different platforms for heterogeneous computing. By using this framework, several benchmarks are mapped onto IMEC for evaluating its performance, energy and resource utilization. The simulation results show that, IMEC reduces the energy consumption by 99.6 percent, and achieves 644x speedup, compared to a baseline CPU system. We further compare IMEC with FPGA architecture, and demonstrate that the performance improvement is not simply obtained by replacing SRAM cells with denser RRAM cells.", acknowledgement = ack-nhfb, affiliation = "Zha, Y (Reprint Author), Univ Wisconsin, Elect \& Comp Engn Dept, Madison, WI 53706 USA. Zha, Yue; Li, Jing, Univ Wisconsin, Elect \& Comp Engn Dept, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yzha3@wisc.edu jli587@wisc.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Decoding; Energy efficiency; energy-efficiency computing; Field programmable gate arrays; Non-volatile memory; Nonvolatile memory; processing-in-memory; Program processors; TCAM", keywords-plus = "ARCHITECTURE", number-of-cited-references = "20", research-areas = "Computer Science", times-cited = "1", unique-id = "Zha:2017:IFM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Chen:2017:IGP, author = "Li-Jhan Chen and Hsiang-Yun Cheng and Po-Han Wang and Chia-Lin Yang", title = "Improving {GPGPU} Performance via Cache Locality Aware Thread Block Scheduling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "127--131", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2693371", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Modern GPGPUs support the concurrent execution of thousands of threads to provide an energy-efficient platform. However, the massive multi-threading of GPGPUs incurs serious cache contention, as the cache lines brought by one thread can easily be evicted by other threads in the small shared cache. In this paper, we propose a software-hardware cooperative approach that exploits the spatial locality among different thread blocks to better utilize the precious cache capacity. Through dynamic locality estimation and thread block scheduling, we can capture more performance improvement opportunities than prior work that only explores the spatial locality between consecutive thread blocks. 
Evaluations across diverse GPGPU applications show that, on average, our locality-aware scheduler provides 25 and 9 percent performance improvement over the commonly-employed round-robin scheduler and the state-of-the-art scheduler, respectively.", acknowledgement = ack-nhfb, affiliation = "Chen, LJ (Reprint Author), Natl Taiwan Univ, Taipei 10617, Taiwan. Chen, Li-Jhan; Wang, Po-Han; Yang, Chia-Lin, Natl Taiwan Univ, Taipei 10617, Taiwan. Cheng, Hsiang-Yun, Acad Sinica, Taipei 11529, Taiwan.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "r03922026@csie.ntu.edu.tw hycheng@citi.sinica.edu.tw f96922002@csie.ntu.edu.tw yangc@csie.ntu.edu.tw", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Ministry of Science and Technology of Taiwan [MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002, MOST-105-2218-E-002-025]; MediaTek Inc., Hsin-chu, Taiwan", funding-text = "This work is supported in part by research grants from the Ministry of Science and Technology of Taiwan (MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002, and MOST-105-2218-E-002-025), and sponsored by MediaTek Inc., Hsin-chu, Taiwan.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache lines; cache locality; cache locality aware thread block scheduling; Cache memory; cache storage; consecutive thread blocks; Dispatching; dynamic locality estimation; energy-efficient platform; GPGPU; GPGPU performance; graphics processing units; Graphics processing units; Instruction sets; locality-aware scheduler; multi-threading; performance improvement opportunities; precious cache capacity; processor scheduling; serious cache contention; shared cache; thread block scheduling; Two dimensional displays", number-of-cited-references = "18", research-areas = "Computer Science", times-cited = "0", unique-id = "Chen:2017:IGP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Garland:2017:LCM, author = "James Garland and David Gregg", title = "Low Complexity Multiply Accumulate Unit for Weight-Sharing Convolutional Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "132--135", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2656880", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Convolutional Neural Networks (CNNs) are one of the most successful deep machine learning technologies for processing image, voice and video data. CNNs require large amounts of processing capacity and memory, which can exceed the resources of low power mobile and embedded systems. Several designs for hardware accelerators have been proposed for CNNs which typically contain large numbers of Multiply Accumulate (MAC) units. 
One approach to reducing data sizes and memory traffic in CNN accelerators is ``weight sharing'', where the full range of values in a trained CNN are put in bins and the bin index is stored instead of the original weight value. In this paper we propose a novel MAC circuit that exploits binning in weight-sharing CNNs. Rather than computing the MAC directly we instead count the frequency of each weight and place it in a bin. We then compute the accumulated value in a subsequent multiply phase. This allows hardware multipliers in the MAC circuit to be replaced with adders and selection logic. Experiments show that for the same clock speed our approach results in fewer gates, smaller logic, and reduced power.", acknowledgement = ack-nhfb, affiliation = "Garland, J (Reprint Author), Trinity Coll Dublin, Dublin 2, Ireland. Garland, James; Gregg, David, Trinity Coll Dublin, Dublin 2, Ireland.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jgarland@tcd.ie david.gregg@cs.tcd.ie", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Science Foundation Ireland [12/IA/1381]", funding-text = "This research is supported by Science Foundation Ireland, Project 12/IA/1381.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "adders; arithmetic hardware circuits; bin index; CNN accelerators; convolution; Convolutional neural network; Convolutional neural networks; deep machine learning technologies; embedded systems; Energy efficiency; feedforward neural nets; hardware accelerators; hardware multipliers; learning (artificial intelligence); Logic gates; MAC circuit; Machine learning; memory traffic; multiply accumulate; multiply accumulate units; multiplying circuits; Neural networks; original weight value; power efficiency; subsequent multiply phase; video data; weight-sharing CNN; weight-sharing convolutional neural networks", number-of-cited-references = "9", ORCID-numbers = "Garland, James/0000-0002-8688-9407", research-areas = "Computer Science", researcherid-numbers = "Garland, James/L-1294-2019", times-cited = "2", unique-id = "Garland:2017:LCM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jung:2017:NIP, author = "Myoungsoo Jung", title = "{NearZero}: an Integration of Phase Change Memory with Multi-Core Coprocessor", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "136--140", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2694828", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Multi-core based coprocessors have become powerful research vehicles to analyze a large amount of data. Even though they can accelerate data processing by using a hundred cores, the data unfortunately exist on an external storage device. 
The separation of computation and storage introduces redundant memory copies and unnecessary data transfers over different physical device boundaries, which limit the benefits of coprocessor-accelerated data processing. In addition, the coprocessors need assistance from host-side resources to access the external storage, which can require additional system context switches. To address these challenges, we propose NearZero, a novel DRAM-less coprocessor architecture that precisely integrates a state-of-the-art phase change memory into its multi-core accelerator. In this work, we implement an FPGA-based memory controller that extracts important device parameters from real phase change memory chips, and apply them to a commercially available hardware platform that employs multiple processing elements over a PCIe fabric. The evaluation results reveal that NearZero achieves on average 47 percent better performance than advanced coprocessor approaches that use direct I/Os (between storage and coprocessors), while consuming only 19 percent of the total energy of such advanced coprocessors.", acknowledgement = ack-nhfb, affiliation = "Jung, M (Reprint Author), Yonsei Univ, Seoul 03722, South Korea. Jung, Myoungsoo, Yonsei Univ, Seoul 03722, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "m.jung@yonsei.ac.kr", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NRF [2016R1C1B2015312, DE-AC02-05CH 11231]; MSIP [IITP-2017-2017-0-01015]; [MemRay 2015-11-1731]", funding-text = "The author thanks MemRay Corporation, Samsung, TI for their research sample donation and technical support. The author also thanks J. Zhang, H. Jeong and G. Park who help him prepare to set up preliminary evaluation environment. This research is supported by MemRay 2015-11-1731. 
This work is also supported in part by NRF 2016R1C1B2015312, DE-AC02-05CH 11231 and MSIP IITP-2017-2017-0-01015.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerators; additional system context; advanced coprocessor approaches; Computer architecture; coprocessors; Coprocessors; data processing; Data storage; Data transfer; DRAM chips; DRAM-less coprocessor architecture; external storage device; Field programmable gate arrays; field programmable gate arrays; hardware architecture; host-side resources; hybrid systems; important device parameters; mass storage; memory structures; multicore accelerator; multicore-based coprocessors; multiple processing elements; multiprocessing systems; multiprocessors; NearZero; Network architecture; non-volatile memory; Nonvolatile memory; parallel architectures; phase change memories; phase change memory chips; Phase change random access memory; powerful research vehicles; redundant memory copies; Storage devices; storage management; unnecessary data transfers", number-of-cited-references = "12", research-areas = "Computer Science", researcherid-numbers = "Jung, Myoungsoo/F-4565-2019", times-cited = "2", unique-id = "Jung:2017:NIP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yavits:2017:RAD, author = "Leonid Yavits and Uri Weiser and Ran Ginosar", title = "Resistive Address Decoder", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "141--144", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2670539", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardwired dynamic NAND address decoders are widely used in random access memories to decode parts of the address. 
Replacing wires by resistive elements allows storing and reprogramming the addresses and matching them to an input address. The resistive address decoder thus becomes a content addressable memory, while the read latency and dynamic energy remain almost identical to those of a hardwired address decoder. One application of the resistive address decoder is a fully associative TLB with read latency and energy consumption similar to those of a one-way associative TLB. Another application is a many-way associative cache with read latency and energy consumption similar to those of a direct mapped one. A third application is elimination of physical addressing and using virtual addresses throughout the entire memory hierarchy by introducing the resistive address decoder into the main memory.", acknowledgement = ack-nhfb, affiliation = "Yavits, L (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa, Israel. Yavits, Leonid; Weiser, Uri; Ginosar, Ran, Technion Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yavits@tx.technion.ac.il uri.weiser@ee.technion.ac.il ran@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Address decoder; cache; cache storage; CAM; content addressable memory; content-addressable storage; Decoding; decoding; dynamic energy; energy consumption; Energy consumption; fully associative TLB; hardwired address decoder; hardwired dynamic NAND address decoders; Logic gates; many-way associative cache; memory hierarchy; memristors; Memristors; memristors; NAND circuits; Network address translation; one-way associative TLB; physical address; physical addressing using virtual addresses; Programming; RAM; random access memories; Random access memory; random-access storage; read latency; resistive address decoder; resistive memory; TLB; virtual address; virtual addresses", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "2", unique-id = "Yavits:2017:RAD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Manivannan:2017:RAG, author = "Madhavan Manivannan and Miquel Peric{\`a}s and Vassilis Papaefstathiou and Per Stenstr{\"o}m", title = "Runtime-Assisted Global Cache Management for Task-Based Parallel Programs", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "145--148", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2606593", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Dead blocks are handled inefficiently in multi-level cache hierarchies because the decision as to whether a block is dead has to be taken locally at each cache level. This paper introduces runtime-assisted global cache management to quickly deem blocks dead across cache levels in the context of task-based parallel programs. 
The scheme is based on a cooperative hardware/software approach that leverages static and dynamic information about future data region reuse(s) available to runtime systems for task-based parallel programming models. We show that our proposed runtime-assisted global cache management approach outperforms previously proposed local dead-block management schemes for task-based parallel programs.", acknowledgement = ack-nhfb, affiliation = "Manivannan, M (Reprint Author), Chalmers Univ Technol, Dept Comp Sci \& Engn, S-41258 Gothenburg, Sweden. Manivannan, Madhavan; Pericas, Miquel; Papaefstathiou, Vassilis; Stenstrom, Per, Chalmers Univ Technol, Dept Comp Sci \& Engn, S-41258 Gothenburg, Sweden.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "madhavan@chalmers.se miquelp@chalmers.se vaspap@chalmers.se per.stenstrom@chalmers.se", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Swedish Foundation for Strategic Research (SSF) under SCHEME project [RIT10-0033]; European Research Council (ERC) under MECCA project [340328]", funding-text = "This research is supported by grants from the Swedish Foundation for Strategic Research (SSF) under the SCHEME project (RIT10-0033) and the European Research Council (ERC) under the MECCA project (contract 340328). The simulations were run on the resources provided by the Swedish National Infrastructure for Computing (SNIC) at C3SE.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache level; Cache memory; cache storage; Data models; dead blocks; dead-block management schemes; Multi-level cache hierarchies; multilevel cache hierarchies; Optimization; parallel programming; Parallel programming; parallel programming models; parallel programs; prediction; Predictive models; run-time system; Runtime; runtime systems; runtime-assisted global cache management; Semantics; storage management", keywords-plus = "REPLACEMENT; PREDICTION", number-of-cited-references = "20", oa = "Bronze", ORCID-numbers = "Stenstrom, Per/0000-0002-4280-3843", research-areas = "Computer Science", times-cited = "0", unique-id = "Manivannan:2017:RAG", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Perais:2017:SFM, author = "Arthur Perais and Andre Seznec", title = "Storage-Free Memory Dependency Prediction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "149--152", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2628379", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory Dependency Prediction (MDP) is paramount to good out-of-order performance, but decidedly not trivial as a all instances of a given static load may not necessarily depend on all instances of a given static store. As a result, for a given load, MDP should predict the exact store instruction the load depends on, and not only whether it depends on an inflight store or not, i.e., ideally, prediction should not be binary. 
However, we first argue that given the high degree of sophistication of modern branch predictors, the fact that a given dynamic load depends on an inflight store can be captured using the binary prediction capabilities of the branch predictor, providing coarse MDP at zero storage overhead. Second, by leveraging hysteresis counters, we show that the precise producer store can in fact be identified. This embodiment of MDP yields performance levels that are on par with state-of-the-art, and requires less than 70 additional bits of storage over a baseline without MDP at all.", acknowledgement = ack-nhfb, affiliation = "Perais, A (Reprint Author), INRIA IRISA, F-35000 Rennes, France. Perais, Arthur; Seznec, Andre, INRIA IRISA, F-35000 Rennes, France.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "arthur.perais@inria.fr andre.seznec@inria.fr", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "binary prediction capabilities; branch prediction space-efficiency; branch predictor; cache storage; coarse MDP; instruction sets; MDP yields performance levels; Memory dependency prediction; memory dependency prediction; Memory management; modern branch predictors; Out of order; out-of-order performance; precise producer store; Predictive models; storage management; storage-free memory dependency prediction; zero storage overhead", keywords-plus = "COMMUNICATION; QUEUE", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "1", unique-id = "Perais:2017:SFM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Mirhosseini:2017:SPB, author = "Amirhossein Mirhosseini and Aditya Agrawal and Josep Torrellas", title = "{Survive}: Pointer-Based In-{DRAM} Incremental Checkpointing for Low-Cost Data Persistence and Rollback-Recovery", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "153--157", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2646340", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper introduces the Survive DRAM architecture for effective in-memory micro-checkpointing. Survive implements low-cost incremental checkpointing, enabling fast rollback that can be used in various architectural techniques such as speculation, approximation, or low voltage operation. Survive also provides crash consistency when used as the frontend of a hybrid DRAM-NVM memory system. This is accomplished by carefully copying the incremental checkpoints generated in the DRAM frontend to the NVM backend. 
Simulations show that Survive only imposes an average 3.5 percent execution time overhead over an unmodified DRAM main-memory system with no checkpointing, while reducing the number of NVM writes by 89 percent over an NVM-only main-memory system.", acknowledgement = ack-nhfb, affiliation = "Mirhosseini, A (Reprint Author), Univ Michigan, Ann Arbor, MI 48109 USA. Mirhosseini, Amirhossein, Univ Michigan, Ann Arbor, MI 48109 USA. Agrawal, Aditya, NVIDIA Corp, Santa Clara, CA 95050 USA. Torrellas, Josep, Univ Illinois, Champaign, IL 61801 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "miramir@umich.edu adityaa@nvidia.com torrella@illinois.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architectural techniques; checkpointing; Checkpointing; checkpointing; Computer architecture; Computer crashes; DRAM chips; hybrid DRAM-NVM memory system; In-DRAM incremental checkpointing; in-memory microcheckpointing; incremental checkpoints; low voltage operation; low-cost data persistence; low-cost incremental checkpointing; memory architecture; Non-volatile memory; Nonvolatile memory; NVM-only main-memory system; Random access memory; random-access storage; reliability; rollback-recovery; software fault tolerance; survive DRAM architecture; system recovery; Transistors; unmodified DRAM main-memory system", keywords-plus = "PHASE-CHANGE MEMORY", number-of-cited-references = "21", ORCID-numbers = "Mirhosseini, Amirhossein/0000-0001-6501-6087", research-areas = "Computer Science", times-cited = "5", unique-id = "Mirhosseini:2017:SPB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Pinto:2017:TTA, author = "Sandro Pinto and Jorge Pereira and Tiago Gomes and Mongkol Ekpanyapong and Adriano Tavares", title = "Towards a 
{TrustZone}-Assisted Hypervisor for Real-Time Embedded Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "158--161", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2617308", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Virtualization technology starts becoming more and more widespread in the embedded space. The penalties incurred by standard software-based virtualization is pushing research towards hardware-assisted solutions. Among the existing commercial off-the-shelf technologies for secure virtualization, ARM TrustZone is attracting particular attention. However, it is often seen with some scepticism due to the dual-OS limitation of existing state-of-the-art solutions. This letter presents the implementation of a TrustZone-based hypervisor for real-time embedded systems, which allows multiple RTOS partitions on the same hardware platform. The results demonstrate that virtualization overhead is less than 2 percent for a 10 milliseconds guest-switching rate, and the system remains deterministic. This work goes beyond related work by implementing a TrustZone-assisted solution that allows the execution of an arbitrary number of guest OSes while providing the foundation to drive next generation of secure virtualization solutions for resource-constrained embedded devices.", acknowledgement = ack-nhfb, affiliation = "Pinto, S (Reprint Author), Univ Minho, Dept Ctr Algoritmi, P-4704553 Braga, Portugal. Pinto, Sandro; Pereira, Jorge; Gomes, Tiago; Tavares, Adriano, Univ Minho, Dept Ctr Algoritmi, P-4704553 Braga, Portugal. Ekpanyapong, Mongkol, Asian Inst Technol, Pathum Thani 12120, Thailand.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "sandro.pinto@algoritmi.uminho.pt jorge.m.pereira@algoritmi.uminho.pt tiago.m.gomes@algoritmi.uminho.pt mongkol@ait.ac.th adriano.tavares@algoritmi.uminho.pt", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "COMPETE [POCI-01-0145-FEDER-007043]; FCT - Fundacao para a Ciencia e Tecnologia [SFRH/BD/91530/2012, UID/CEC/00319/2013]", funding-text = "This work has been supported by COMPETE: POCI-01-0145-FEDER-007043 and FCT --- Fundacao para a Ciencia e Tecnologia (grant SFRH/BD/91530/2012 and UID/CEC/00319/2013).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ARM; ARM TrustZone; dual-OS limitation; embedded space; embedded systems; Embedded systems; embedded systems; hardware platform; hardware-assisted solutions; monitor; Monitoring; multiple RTOS partitions; operating systems (computers); Program processors; real-time; real-time embedded systems; Real-time systems; RODOS; secure virtualization solutions; security of data; standard software; trusted computing; TrustZone; TrustZone-assisted solution; Virtual machine monitors; virtualisation; Virtualization; virtualization overhead; virtualization technology", number-of-cited-references = "12", ORCID-numbers = "Gomes, Tiago/0000-0002-4071-9015 Salgado Pinto, Sandro Emanuel/0000-0003-4580-7484 Tavares, Adriano/0000-0001-8316-6927", research-areas = "Computer Science", researcherid-numbers = "Gomes, Tiago/A-4751-2016 Salgado Pinto, Sandro Emanuel/D-6725-2015 Tavares, Adriano/M-5257-2013", times-cited = "3", unique-id = "Pinto:2017:TTA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Carlson:2017:THL, author = "Trevor E. 
Carlson and Kim-Anh Tran and Alexandra Jimborean and Konstantinos Koukos and Magnus Sj{\"a}lander and Stefanos Kaxiras", title = "Transcending Hardware Limits with Software Out-of-Order Processing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "162--165", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2672559", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Building high-performance, next-generation processors require novel techniques to enable improved performance given today's power-and energy-efficiency requirements. Additionally, a widening gap between processor and memory performance makes it even more difficult to improve efficiency with conventional techniques. While out-of-order architectures attempt to hide this memory latency with dynamically reordered instructions, they lack the energy efficiency seen in in-order processors. Thus, our goal is to reorder the instruction stream to avoid stalls and improve utilization for energy efficiency and performance. To accomplish this goal, we propose an enhanced stall-on-use in-order core that improves energy efficiency (and therefore performance in these power-limited designes) through out-of-program-order execution. During long latency loads, the Software Out-of-Order Processing (SWOOP) core exposes additional memory-and instruction-level parallelism to perform useful, non-speculative work. The resulting instruction lookahead of the SWOOP core reaches beyond the conventional fixed-sized processor structures with the help of transparent hardware register contexts. 
Our results show that SWOOP demonstrates a 34 percent performance improvement on average compared with an in-order, stall-on-use core, with an energy reduction of 23 percent.", acknowledgement = ack-nhfb, affiliation = "Carlson, TE (Reprint Author), Uppsala Univ, S-75236 Uppsala, Sweden. Carlson, Trevor E.; Tran, Kim-Anh; Jimborean, Alexandra; Koukos, Konstantinos; Sjalander, Magnus; Kaxiras, Stefanos, Uppsala Univ, S-75236 Uppsala, Sweden. Sjalander, Magnus, Norwegian Univ Sci \& Technol NTNU, N-7491 Trondheim, Norway.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "trevor.carlson@it.uu.se kim-anh.tran@it.uu.se alexandra.jimborean@it.uu.se konstantinos.koukos@it.uu.se magnus.sjalander@idi.ntnu.no stefanos.kaxiras@it.uu.se", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Compilation; Context awareness; decoupled access-execute; dynamically reordered instructions; energy; energy conservation; energy efficiency; Energy management; energy reduction; energy-efficiency requirements; enhanced stall-on-use; fixed-sized processor structures; hardware limits; in-order core; in-order processors; instruction stream; instruction-level parallelism; memory level parallelism; microprocessor chips; next-generation processors; Out of order; out-of-program-order execution; parallel architectures; power-limited designes; Prefetching; resulting instruction lookahead; software out-of-order processing; stall-on-use core; SWOOP", number-of-cited-references = "9", ORCID-numbers = "Sjalander, Magnus/0000-0003-4232-6976 Jimborean, Alexandra/0000-0001-8642-2447", research-areas = "Computer Science", researcherid-numbers = "Sjalander, Magnus/N-5995-2019", times-cited = "0", unique-id = "Carlson:2017:THL", web-of-science-categories = "Computer Science, Hardware \& Architecture", 
} @Article{Ahmadvand:2017:UDV, author = "Hossein Ahmadvand and Maziar Goudarzi", title = "Using Data Variety for Efficient Progressive Big Data Processing in Warehouse-Scale Computers", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "166--169", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2636293", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Warehouse Scale Computers (WSC) are often used for various big data jobs where the big data under processing comes from a variety of sources. We show that different data portions, from the same or different sources, have different significances in determining the final outcome of the computation, and hence, by prioritizing them and assigning more resources to processing of more important data, the WSC can be used more efficiently in terms of time as well as cost. We provide a simple low-overhead mechanism to quickly assess the significance of each data portion, and show its effectiveness in finding the best ranking of data portions. We continue by demonstrating how this ranking is used in resource allocation to improve time and cost by up to 24 and 9 percent respectively, and also discuss other uses of this ranking information, e.g., in faster progressive approximation of the final outcome of big data job without processing entire data, and in more effective use of renewable energies in WSCs.", acknowledgement = ack-nhfb, affiliation = "Ahmadvand, H (Reprint Author), Sharif Univ Technol, Dept Comp Engn, Azadi Ave, Tehran 1136511155, Iran. Ahmadvand, Hossein; Goudarzi, Maziar, Sharif Univ Technol, Dept Comp Engn, Azadi Ave, Tehran 1136511155, Iran.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "ahmadvand@ce.sharif.edu goudarzi@sharif.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Sharif University of Technology [G930826]", funding-text = "This research is supported by grant number G930826 from Sharif University of Technology. We are grateful for their support.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Big data; Big Data; Big data; Computers; data warehouses; Distributed databases; efficiency; efficient progressive Big Data processing; order of processing; resource allocation; Resource management; sampling; warehouse-scale computers; WSC", number-of-cited-references = "16", research-areas = "Computer Science", times-cited = "1", unique-id = "Ahmadvand:2017:UDV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhang:2017:WDP, author = "Dan Zhang and Xiaoyu Ma and Derek Chiou", title = "Worklist-Directed Prefetching", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "170--173", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2627571", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Researchers have demonstrated the benefits of hardware worklist accelerators, which offload scheduling and load balancing operations in parallel graph applications. However, many of these applications are still heavily memory latency-bound due to the irregular nature of graph data structure access patterns. We utilize the fact that the accelerator has knowledge of upcoming work items to accurately issue prefetch requests, a technique we call worklist-directed prefetching. 
A credit-based system to improve prefetch timeliness and prevent cache thrashing is proposed. The proposed prefetching scheme is simulated on a 64-core CMP with a hardware worklist accelerator on several graph algorithms and inputs. Enabling worklist-directed prefetching into the L2 cache results in an average speedup of 1.99, and up to 2.35 on Breadth-First Search.", acknowledgement = ack-nhfb, affiliation = "Zhang, D (Reprint Author), Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA. Zhang, Dan; Ma, Xiaoyu; Chiou, Derek, Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "dan.zhang@utexas.edu xma@utexas.edu derek@ece.utexas.edu", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerators; cache storage; data structures; graph algorithms; graph data structure access patterns; graph problems; graph theory; Hardware; hardware worklist accelerator; load-balancing operations; microprocessor chips; parallel graph applications; parallel processors; Prefetching; prefetching researchers; prefetching scheme; Processor scheduling; resource allocation; scheduling; Software algorithms; storage management", keywords-plus = "ARCHITECTURAL SUPPORT; ALGORITHM", number-of-cited-references = "23", research-areas = "Computer Science", times-cited = "0", unique-id = "Zhang:2017:WDP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Scionti:2018:EMM, author = "Alberto Scionti and Somnath Mazumdar and Stephane Zuckerman", title = "Enabling Massive Multi-Threading with Fast Hashing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = 
"https://doi.org/10.1109/LCA.2017.2697863", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The next generation of high-performance computers is expected to execute threads in orders of magnitude higher than today's systems. Improper management of such huge amount of threads can create resource contention, leading to overall degraded system performance. By leveraging more practical approaches to distribute threads on the available resources, execution models and manycore chips are expected to overcome limitations of current systems. Here, we present DELTA --- a Data-Enabled muLti-Threaded Architecture, where a producer-consumer scheme is used to execute threads via complete distributed thread management mechanism. We consider a manycore tiled-chip architecture where Network-on-Chip (NoC) routers are extended to support our execution model. The proposed extension is analysed, while simulation results confirm that DELTA can manage a large number of simultaneous threads, relying on a simple hardware structure.", acknowledgement = ack-nhfb, affiliation = "Scionti, A (Reprint Author), ISMB, I-10138 Turin, Italy. Scionti, Alberto, ISMB, I-10138 Turin, Italy. Mazumdar, Somnath, Univ Siena, Siena, SI, Italy. Zuckerman, Stephane, Michigan Technol Univ, Houghton, MI 49931 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "scionti@ismb.it mazumdar@dii.unisi.it szuckerm@mtu.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "complete distributed thread management mechanism; Computational modeling; Computer architecture; data-enabled multithreaded architecture; Dataflow; degraded system performance; DELTA; execution model; fast hashing; Hardware; hashing; high-performance computers; Instruction sets; manycore chips; manycore tiled-chip architecture; massive multithreading; microprocessor chips; multi-threading; multiprocessing systems; network-on-chip; network-on-chip routers; Organizations; producer-consumer scheme; Programming; resource contention; Scheduling; simultaneous threads; thread-scheduling", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "1", unique-id = "Scionti:2018:EMM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2018:IIC, author = "Anonymous", title = "2017 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 16", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "1--6", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2799560", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jeon:2018:HMP, author = "Dong-Ik Jeon and Kyeong-Bin Park and Ki-Seok Chung", title = "{HMC-MAC}: Processing-in Memory Architecture for Multiply--Accumulate Operations with Hybrid Memory Cube", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2700298", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Many studies focus on implementing processing-in memory (PIM) on the logic die of the hybrid memory cube (HMC) architecture. The multiply-accumulate (MAC) operation is heavily used in digital signal processing (DSP) systems. In this paper, a novel PIM architecture called HMC-MAC that implements the MAC operation in the HMC is proposed. The vault controllers of the conventional HMC are working independently to maximize the parallelism, and HMC-MAC is based on the conventional HMC without modifying the architecture much. Therefore, a large number of MAC operations can be processed in parallel. In HMC-MAC, the MAC operation can be carried out simultaneously with as much as 128 KB data. The correctness on HMC-MAC is verified by simulations, and its performance is better than the conventional CPU-based MAC operation when the MAC operation is consecutively executed at least six times", acknowledgement = ack-nhfb, affiliation = "Chung, KS (Reprint Author), Hanyang Univ, Dept Elect \& Comp Engn, Seoul 04763, South Korea. Jeon, Dong-Ik; Park, Kyeong-Bin; Chung, Ki-Seok, Hanyang Univ, Dept Elect \& Comp Engn, Seoul 04763, South Korea.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "estwingz@naver.com lay1523@naver.com kchung@hanyang.ac.kr", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Basic Science Research Program through the National Research Foundation of Korea(NRF) --- Ministry of Education [NRF-2015R1D1A1A09061079]", funding-text = "This research was supported by Basic Science Research Program through the National Research Foundation of Korea(NRF) funded by the Ministry of Education (NRF-2015R1D1A1A09061079).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computers; CPU-based MAC operation; digital signal processing; digital signal processing systems; DRAM chips; DSP systems; Electronic mail; HMC-MAC; hybrid memory cube architecture; logic circuits; logic die; memory architecture; Memory architecture; Memory management; memory size 128.0 KByte; Memory structures; memory used as logic; multiple data stream architectures; multiply-accumulate operation; parallel processing; processing-in memory architecture; Random access memory; Registers; vault controllers", number-of-cited-references = "11", ORCID-numbers = "Jeon, Dong-Ik/0000-0002-8572-4184", research-areas = "Computer Science", times-cited = "0", unique-id = "Jeon:2018:HMP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{VandenSteen:2018:MSP, author = "Sam {Van den Steen} and Lieven Eeckhout", title = "Modeling Superscalar Processor Memory-Level Parallelism", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2701370", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", 
abstract = "This paper proposes an analytical model to predict Memory-Level Parallelism (MLP) in a superscalar processor. We profile the workload once and measure a set of distributions to characterize the workload's inherent memory behavior. We subsequently generate a virtual instruction stream, over which we then process an abstract MLP model to predict MLP for a particular micro-architecture with a given ROB size, LLC size, MSHR size and stride-based prefetcher. Experimental evaluation reports an improvement in modeling error from 16.9 percent for previous work to 3.6 percent on average for the proposed model.", acknowledgement = ack-nhfb, affiliation = "Van den Steen, S (Reprint Author), Univ Ghent, Ghent, Belgium. Van den Steen, Sam; Eeckhout, Lieven, Univ Ghent, Ghent, Belgium.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sam.vandensteen@ugent.be lieven.eeckhout@ugent.be", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Agency for Innovation by Science and Technology (IWT)", funding-text = "We thank the anonymous reviewers for their constructive and insightful feedback. Sam Van den Steen is supported through a doctoral fellowship by the Agency for Innovation by Science and Technology (IWT).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; Computational modeling; Computer architecture; Hardware; LLC size; Load modeling; memory architecture; memory level parallelism (MLP); micro-architecture; MLP model; Modeling; MSHR size; Predictive models; Prefetching; ROB size; superscalar processor memory-level parallelism modeling; virtual instruction stream", number-of-cited-references = "11", ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214", research-areas = "Computer Science", times-cited = "0", unique-id = "VandenSteen:2018:MSP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Durkovic:2018:BNS, author = "Srdjan Durkovic and Zoran Cica", title = "{Birkhoff--von Neumann} Switch Based on Greedy Scheduling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2707082", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "It is important to develop high performance packet switches that are highly scalable. Among the popular solutions are input queued (IQ) switches and load balanced Birkhoff-von Neumann (LB-BvN) switches. However, both solutions have their drawbacks. Switch configuration pattern in IQ switches is random which can limit the supported port speed. On the other hand, LB-BvN switches require two switching stages which increase the overall cost. Also, some LB-BvN solutions suffer from the packet out of sequence problem. 
In this paper, we propose a novel packet switch architecture that combines the best properties of the IQ and LB-BvN switches and eliminates their drawbacks.", acknowledgement = ack-nhfb, affiliation = "Cica, Z (Reprint Author), Univ Belgrade, Sch Elect Engn, Belgrade 11120, Serbia. Durkovic, Srdjan; Cica, Zoran, Univ Belgrade, Sch Elect Engn, Belgrade 11120, Serbia.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "srdjad6@gmail.com zoran.cica@etf.rs", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Architecture; data communications; Delays; greedy scheduling; high performance packet; Internet; IP networks; IQ switches; LB-BvN solutions; LB-BvN switches; load balanced Birkhoff-von Neumann switches; packet switch architecture; packet switching; packet-switching networks; Ports (Computers); queueing theory; Random access memory; resource allocation; routers; Scheduling; switch configuration pattern; Switches; switching stages; telecommunication scheduling", keywords-plus = "2-STAGE SWITCHES; DESIGN; ALGORITHM", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "1", unique-id = "Durkovic:2018:BNS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Pham:2018:TSM, author = "Binh Pham and Derek Hower and Abhishek Bhattacharjee and Trey Cain", title = "{TLB} Shootdown Mitigation for Low-Power Many-Core Servers with {L1} Virtual Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2712140", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Power efficiency has become one of the most important design constraints for high-performance systems. In this paper, we revisit the design of low-power virtually-addressed caches. While virtually-addressed caches enable significant power savings by obviating the need for Translation Lookaside Buffer (TLB) lookups, they suffer from several challenging design issues that curtail their widespread commercial adoption. We focus on one of these challenges-cache flushes due to virtual page remappings. We use detailed studies on an ARM many-core server to show that this problem degrades performance by up to 25 percent for a mix of multi-programmed and multi-threaded workloads. Interestingly, we observe that many of these flushes are spurious, and caused by an indiscriminate invalidation broadcast on ARM architecture. In response, we propose a low-overhead and readily implementable hardware mechanism using bloom filters to reduce spurious invalidations and mitigate their ill effects.", acknowledgement = ack-nhfb, affiliation = "Pham, B (Reprint Author), Rutgers State Univ, Dept Comp Sci, Piscataway, NJ 08854 USA. Binh Pham; Bhattacharjee, Abhishek, Rutgers State Univ, Dept Comp Sci, Piscataway, NJ 08854 USA. Hower, Derek, Qualcomm Technol Inc, Piscataway, NJ 08854 USA. Cain, Trey, Qualcomm Datactr Technol Inc, Piscataway, NJ 08854 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "binhpham@rutgers.edu dhower@qti.qualcomm.com abhib@rutgers.edu tcain@qti.qualcomm.com", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ARM many-core server; Benchmark testing; bloom filters; cache flushes; cache storage; Coherence; Computer architecture; design constraints; Hardware; high-performance systems; Indexes; L1 virtual caches; low-overhead; low-power many-core servers; low-power virtually-addressed caches; microprocessor chips; multi-threading; multicores; multiprocessing systems; multiprogrammed workloads; multiprogramming; multithreaded workloads; multithreading; power efficiency; power savings; Registers; Servers; TLB; TLB shootdown mitigation; Virtual Cache; virtual memory; virtual page remappings", number-of-cited-references = "21", research-areas = "Computer Science", times-cited = "0", unique-id = "Pham:2018:TSM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yavits:2018:ASM, author = "Leonid Yavits and Ran Ginosar", title = "Accelerator for Sparse Machine Learning", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2714667", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Sparse matrix by vector multiplication (SpMV) plays a pivotal role in machine learning and data mining. We propose and investigate an SpMV accelerator, specifically designed to accelerate the sparse matrix by sparse vector multiplication (SpMSpV), and to be integrated in a CPU core. 
We show that our accelerator outperforms a similar solution by 70x while achieving 8x higher power efficiency, which yields an estimated 29x energy reduction for SpMSpV based applications.", acknowledgement = ack-nhfb, affiliation = "Yavits, L (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa, Israel. Yavits, Leonid; Ginosar, Ran, Technion Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yavits@technion.ac.il ran@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; accelerator; Algorithm design and analysis; CPU core; data mining; Indexes; learning (artificial intelligence); matrix multiplication; Memory management; microprocessor chips; power aware computing; power efficiency; Random access memory; regression analysis; sparse machine learning; sparse matrices; Sparse matrices; sparse matrix; sparse matrix by sparse vector multiplication; Sparse matrix multiplication; sparse vector multiplication; SpMSpV based applications; SpMV; SpMV accelerator; tree searching; vectors", keywords-plus = "MATRIX-VECTOR MULTIPLICATION", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "1", unique-id = "Yavits:2018:ASM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Christoforidis:2018:CTC, author = "Eleftherios-Iordanis Christoforidis and Sotirios Xydis and Dimitrios Soudris", title = "{CF-TUNE}: Collaborative Filtering Auto-Tuning for Energy Efficient Many-Core Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2716919", ISSN 
= "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Energy efficiency is considered today as a first class design principle of modern many-core computing systems in the effort to overcome the limited power envelope. However, many-core processors are characterised by high micro-architectural complexity, which is propagated up to the application level affecting both performance and energy consumption. In this paper, we present CF-TUNE, an online and scalable auto-tuning framework for energy aware applications mapping on emerging many-core architectures. CF-TUNE enables the extraction of an energy-efficient tuning configuration point with minimal application characterisation on the whole tuning configuration space. Instead of analyzing every application against every tuning configuration, it adopts a collaborative filtering technique that quickly and with high accuracy configures the application's tuning parameters by identifying similarities with previously optimized applications. We evaluate CF-TUNE's efficiency against a set of demanding and diverse applications mapped on Intel Many Integrated Core processor and we show that with minimal characterization, e.g., only either two or four evaluations, CF-TUNE recommends a tuning configuration that performs at least at the 94 percent level of the optimal one.", acknowledgement = ack-nhfb, affiliation = "Xydis, S (Reprint Author), Natl Tech Univ Athens, Sch Elect \& Comp Engn, Zografos 15780, Greece. Christoforidis, Eleftherios-Iordanis; Xydis, Sotirios; Soudris, Dimitrios, Natl Tech Univ Athens, Sch Elect \& Comp Engn, Zografos 15780, Greece.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "eleftherios.christoforidis@gmail.com sxydis@microlab.ntua.gr dsoudris@microlab.ntua.gr", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application level; application tuning parameters; Auto-tuning; CF-TUNE efficiency; Collaboration; collaborative filtering auto-tuning; Computer architecture; design space exploration; energy aware application mapping; energy conservation; energy consumption; energy efficient computing; energy efficient many-core processors; energy-efficient tuning configuration point; Instruction sets; Intel many integrated core processor; Intel MIC; machine learning; many-core architectures; manycore architectures; microarchitectural complexity; microprocessor chips; Microwave integrated circuits; minimal application characterisation; modern many-core computing systems; multiprocessing systems; online auto-tuning framework; Optimization; power aware computing; power envelope; scalable auto-tuning framework; Tuning; tuning configuration space", number-of-cited-references = "15", ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847", research-areas = "Computer Science", researcherid-numbers = "Soudris, Dimitrios/O-8843-2019", times-cited = "0", unique-id = "Christoforidis:2018:CTC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Almatrood:2018:DGP, author = "Amjad F. 
Almatrood and Harpreet Singh", title = "Design of Generalized Pipeline Cellular Array in Quantum-Dot Cellular Automata", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2719021", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cellular arrays have been the topic of interest in computer arithmetic and architecture for the last four decades. In this letter, an overall quantum-dot cellular automata (QCA) design for a generalized pipeline cellular array is presented. QCA is one of the promising emerging nanotechnologies that are being considered as possible alternatives to complementary metal-oxide semiconductor technology due to the physical limitations of CMOS. The QCA designs for arithmetic cell and control cell used in the pipeline array are discussed in detail. The equivalent majority logic networks to these cells are generated using the best existing majority logic synthesis method in order to obtain the optimal majority networks which require fewer QCA cells and clock zones compared to other synthesis methods. The proposed array can perform all the basic arithmetic operations such as squaring, square rooting, multiplication, division, etc., which could be quite valuable in considering future large-scale QCA designs.", acknowledgement = ack-nhfb, affiliation = "Almatrood, AF (Reprint Author), Wayne State Univ, Dept Elect \& Comp Engn, Detroit, MI 48202 USA. Almatrood, Amjad F.; Singh, Harpreet, Wayne State Univ, Dept Elect \& Comp Engn, Detroit, MI 48202 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "amjad.almatrood@wayne.edu hsingh@eng.wayne.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "arithmetic cell; Arithmetic processor; cellular arrays; cellular automata; clock zones; Clocks; clocks; CMOS logic circuits; CMOS technology; complementary metal-oxide semiconductor technology; computer architecture; Computer architecture; computer arithmetic; control cell; Delays; equivalent majority logic networks; generalized pipeline cellular array design; large-scale QCA designs; Logic arrays; logic design; Logic gates; majority logic; majority logic synthesis method; Microprocessors; nanoelectronics; nanotechnologies; pipeline array; Pipelines; quantum-dot cellular automata (QCA); quantum-dot cellular automata design", number-of-cited-references = "16", research-areas = "Computer Science", times-cited = "2", unique-id = "Almatrood:2018:DGP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zha:2018:CRC, author = "Yue Zha and Jing Li", title = "{CMA}: a Reconfigurable Complex Matching Accelerator for Wire-Speed Network Intrusion Detection", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "33--36", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2719023", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The rapid growth in network bandwidth and the ever more sophisticated network attack techniques pose challenges to current network intrusion detection systems (NIDS). 
While software-based solutions are incapable of performing wire-speed network traffic monitoring, many hardware-based pattern matching solutions also suffer from capacity limitation and high power consumption. To effectively address these challenges, we propose a reconfigurable complex matching accelerator (CMA) enabled by the emerging nonvolatile memory technology (resistive random access memory) to speed up intrusion detection systems with better energy efficiency. Beyond common equality matching in current NIDS, CMA can be configured to provide a comprehensive set of arithmetic matching functions (e.g., less than), resulting in improved utilization and higher energy efficiency. We evaluate CMA using real-world network security benchmarks. On average, it achieves 84.9 percent area reduction, 97.3 percent energy consumption reduction, and 20 percent improvement in searching speed compared to the SRAM-based Ternary Content Addressable Memory (TCAM) design in state-of-the-art NIDS. It also outperforms emerging RRAM-based TCAM (2.5T1R) design in area, energy and search delay, on the set of evaluated workloads.", acknowledgement = ack-nhfb, affiliation = "Zha, Y (Reprint Author), Univ Wisconsin, Elect \& Comp Engn, Madison, WI 53706 USA. Zha, Yue; Li, Jing, Univ Wisconsin, Elect \& Comp Engn, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yzha3@wisc.edu jli587@wisc.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator; arithmetic matching functions; CMA; Computer architecture; computer network security; computer networks; content-addressable storage; Coprocessors; emerging nonvolatile memory technology; Encoding; energy consumption reduction; higher energy efficiency; intrusion detection; Intrusion detection; IP networks; network bandwidth; network intrusion detection systems; Network security; NIDS; pattern matching; pattern matching solutions; Ports (Computers); random-access storage; real-world network security benchmarks; reconfigurable complex matching accelerator; ReRAM; resistive random access memory; security of data; sophisticated network attack techniques; SRAM chips; TCAM; telecommunication traffic; ternary content addressable memory design; wire-speed network intrusion detection; wire-speed network traffic monitoring", keywords-plus = "PACKET CLASSIFICATION; MODEL", number-of-cited-references = "15", research-areas = "Computer Science", times-cited = "0", unique-id = "Zha:2018:CRC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jung:2018:SMS, author = "Myoungsoo Jung and Jie Zhang and Ahmed Abulila and Miryeong Kwon and Narges Shahidi and John Shalf and Nam Sung Kim and Mahmut Kandemir", title = "{SimpleSSD}: Modeling Solid State Drives for Holistic System Simulation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "37--41", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2750658", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Existing solid state drive (SSD) simulators unfortunately lack hardware and/or software architecture models. 
Consequently, they are far from capturing the critical features of contemporary SSD devices. More importantly, while the performance of modern systems that adopt SSDs can vary based on their numerous internal design parameters and storage-level configurations, a full system simulation with traditional SSD models often requires unreasonably long runtimes and excessive computational resources. In this work, we propose SimpleSSD, a high-fidelity simulator that models all detailed characteristics of hardware and software, while simplifying the nondescript features of storage internals. In contrast to existing SSD simulators, SimpleSSD can easily be integrated into publicly-available full system simulators. In addition, it can accommodate a complete storage stack and evaluate the performance of SSDs along with diverse memory technologies and microarchitectures. Thus, it facilitates simulations that explore the full design space at different levels of system abstraction.", acknowledgement = ack-nhfb, affiliation = "Jung, M (Reprint Author), Yonsei Univ, Comp Architecture \& Memory Syst Lab, Seoul 03722, South Korea. Jung, Myoungsoo; Zhang, Jie; Kwon, Miryeong, Yonsei Univ, Comp Architecture \& Memory Syst Lab, Seoul 03722, South Korea. Abulila, Ahmed; Kim, Nam Sung, Univ Illinois, Champaign, IL 61820 USA. Shahidi, Narges; Kandemir, Mahmut, Penn State Univ, State Coll, PA 16801 USA. Shalf, John, Lawrence Berkeley Natl Lab, Berkeley, CA 94720 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "m.jung@yonsei.ac.kr jie@yonsei.ac.kr abulila2@illinois.edu mkwon@camelab.org nxs314@psu.edu jshalf@lbl.gov nskim@illinois.edu kandemir@cse.psu.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NRF [2016R1C1B2015312]; Mem-Ray grant [2015-11-1731]; US National Science Foundation [1640196, 1439021, 1439057, 1409095, 1626251, 1629915, 1629129, 1526750]; SRC/NRC NERC [2016-NE-2697-A]; [IITP-2017-2017-0-01015]; [NRF-2015M3C4A7065645]; [DOE DE-AC02-05CH11231]", funding-text = "This research is mainly supported by NRF 2016R1C1B2015312. This work is also supported in part by IITP-2017-2017-0-01015, NRF-2015M3C4A7065645, DOE DE-AC02-05CH11231, and Mem-Ray grant (2015-11-1731). Dr. Kim is supported in part by US National Science Foundation 1640196 and SRC/NRC NERC 2016-NE-2697-A. Dr. Kandemir is supported in part by US National Science Foundation grants 1439021, 1439057, 1409095, 1626251, 1629915, 1629129 and 1526750.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "complete storage stack; computational modeling; Computational modeling; computational modeling; computer architecture; Computer architecture; contemporary SSD devices; flash memories; Hardware; high-fidelity simulator; internal design parameters; microprocessors; Microprocessors; microprocessors; nondescript features; parallel processing; Parallel processing; parallel processing; publicly-available full system simulators; SimpleSSD; software; Software; software; solid state drive simulators; SSD simulators; storage-level configurations; system abstraction; system simulation; systems simulation; Systems simulation; systems simulation", number-of-cited-references = "14", research-areas = "Computer Science", researcherid-numbers = "Jung, Myoungsoo/F-4565-2019", times-cited = "2", unique-id = "Jung:2018:SMS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Chowdhury:2018:EMP, author = "Zamshed Chowdhury and Jonathan D. Harms and S. Karen Khatamifard and Masoud Zabihi and Yang Lv and Andrew P. Lyle and Sachin S. Sapatnekar and Ulya R. Karpuzcu and Jian-Ping Wang", title = "Efficient In-Memory Processing Using Spintronics", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "42--46", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2751042", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As the overhead of data retrieval becomes forbidding, bringing processor logic to the memory where the data reside becomes more energy-efficient. While traditional CMOS structures are unsuited to the tight integration of logic and memory, emerging spintronic technologies show remarkable versatility. 
This paper introduces a novel spintronics-based processing-in-memory (PIM) framework called computational RAM (CRAM) to solve data-intensive computing problems.", acknowledgement = ack-nhfb, affiliation = "Chowdhury, Z (Reprint Author), Univ Minnesota, Dept Elect \& Comp Engn, Minneapolis, MN 55455 USA. Chowdhury, Zamshed; Harms, Jonathan D.; Khatamifard, S. Karen; Zabihi, Masoud; Lv, Yang; Lyle, Andrew P.; Sapatnekar, Sachin S.; Karpuzcu, Ulya R.; Wang, Jian-Ping, Univ Minnesota, Dept Elect \& Comp Engn, Minneapolis, MN 55455 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "chowh005@umn.edu harms074@umn.edu khatami@umn.edu zabih003@umn.edu lvxxx057@umn.edu czamshediqbal@gmail.com sachin@umn.edu ukarpuzc@umn.edu jpwang@umn.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "DARPA Non-Volatile Logic program; NSF SPX [1725420]; by C-SPIN, one of the six SRC STARnet Centers; MARCO; DARPA", funding-text = "This work is supported by DARPA Non-Volatile Logic program, NSF SPX grant no. 1725420, and by C-SPIN, one of the six SRC STARnet Centers, sponsored by MARCO and DARPA. Chowdhury and Harms equally contributed to this work.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Adders; computational RAM; CRAM; data retrieval; data-intensive computing problems; Efficient In-Memory Processing; energy-efficiency; Logic arrays; Logic gates; Magnetic tunneling; magnetoelectronics; Memory management; MRAM devices; MTJ; PIM framework; processing-in-memory; processing-in-memory framework; processor logic; Random access memory; spintronic technologies; spintronics; STT-MRAM; traditional CMOS structures", keywords-plus = "UNIVERSAL MEMORY; LOGIC", number-of-cited-references = "25", ORCID-numbers = "Sapatnekar, Sachin/0000-0002-5353-2364", research-areas = "Computer Science", times-cited = "4", unique-id = "Chowdhury:2018:EMP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Ajdari:2018:SHB, author = "Mohammadamin Ajdari and Pyeongsu Park and Dongup Kwon and Joonsung Kim and Jangwoo Kim", title = "A Scalable {HW}-Based Inline Deduplication for {SSD} Arrays", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "47--50", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2753258", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "SSD arrays are becoming popular in modern storage servers as a primary storage, and they aim to reduce the high cost of the devices by performing inline deduplications. Unfortunately, existing software-based inline deduplications cannot achieve the devices' maximum throughput due to their high CPU utilization and power overhead. 
A recently proposed approach to perform device-wide deduplications inside each SSD can distribute the CPU overhead among multiple SSDs, but it also suffers from severely decreasing deduplication opportunities with the increasing number of SSDs deployed per node. Therefore, we propose a node-wide deduplication engine that relies on specialized hardware to perform two key steps of deduplication: data signature generation and table management. Our FPGA-based prototype detects all duplicates, and compared to software-based inline deduplication, it reduces the overall CPU utilization and power consumption by 93.6 and approximately 20 percent respectively for a slow baseline and more for faster baselines.", acknowledgement = ack-nhfb, affiliation = "Kim, J (Reprint Author), Seoul Natl Univ, Dept Elect \& Comp Engn, Seoul 08826, South Korea. Ajdari, Mohammadamin, POSTECH, Dept Comp Sci \& Engn, Pohang 37673, South Korea. Park, Pyeongsu; Kwon, Dongup; Kim, Joonsung; Kim, Jangwoo, Seoul Natl Univ, Dept Elect \& Comp Engn, Seoul 08826, South Korea.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "majdari@postech.ac.kr pyeongsu@snu.ac.kr dongup@snu.ac.kr joonsung90@snu.ac.kr jangwoo@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Basic Science Research Program through the National Research Foundation of Korea (NRF) --- Ministry of Science, ICT \& Future Planning [NRF-2015M3C4A7065647, NRF-2017R1A2B3011038]; Institute for Information \& communications Technology Promotion (IITP) grant --- Korea government (MSIT) [R0190-15-2012]", funding-text = "This work was partly supported by Basic Science Research Program through the National Research Foundation of Korea (NRF) funded by the Ministry of Science, ICT \& Future Planning (NRF-2015M3C4A7065647, NRF-2017R1A2B3011038), and Institute for Information \& communications Technology Promotion (IITP) grant funded by the Korea government (MSIT) (No. R0190-15-2012). Mohammadamin Ajdari and Pyeongsu Park contributed equally to this work.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "CPU overhead; CPU utilization; data handling; data integrity; deduplication; deduplication opportunities; device-wide deduplications; Engines; field programmable gate arrays; file servers; flash memories; FPGA; FPGA-based prototype; Hardware; inline deduplication; modern storage servers; node-wide deduplication engine; Performance evaluation; power consumption; Power demand; power overhead; primary storage; Random access memory; Servers; software-based inline deduplications; SSD; SSD arrays; storage management; Storage server; Throughput", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "1", unique-id = "Ajdari:2018:SHB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hoseinzadeh:2018:FBS, author = "Morteza Hoseinzadeh", title = "Flow-Based Simulation Methodology", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "51--54", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2756051", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This paper presents flow-based simulation, a new methodology for evaluating novel and intricate computer system designs. The main idea of flow-based simulation is to keep the history of every simulated memory element, instead of its latest value, to make it time bonded so that sliding the time forward and backward changes the state of the system accordingly. Having this opportunity, new architectural designs can be evaluated in terms of timing and energy by implementing only a functional simulation. Due to serial execution, the process of the design in a flow-based simulation is traceable and easy to understand. 
As a result, comparing with cycle-driven and event-driven techniques, complicated algorithms can be evaluated much easier. Flow-based simulation simplifies the burden of the timing simulation, and consequently leads to faster development and simulation time.", acknowledgement = ack-nhfb, affiliation = "Hoseinzadeh, M (Reprint Author), Univ Calif San Diego, Dept Comp Sci \& Engn, La Jolla, CA 92093 USA. Hoseinzadeh, Morteza, Univ Calif San Diego, Dept Comp Sci \& Engn, La Jolla, CA 92093 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mhoseinzadeh@cs.ucsd.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computational modeling; Computer architectural simulator; Concurrent computing; cycle-driven techniques; digital simulation; event-driven techniques; flow-based simulation; flow-based simulation methodology; functional simulation; History; Integrated circuit modeling; Interference; intricate computer system designs; simulated memory element; simulation methodologies; Timing; timing simulation; Tools", keywords-plus = "FULL-SYSTEM", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "0", unique-id = "Hoseinzadeh:2018:FBS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Eyerman:2018:MSC, author = "Stijn Eyerman and Wim Heirman and Kristof {Du Bois} and Ibrahim Hur", title = "Multi-Stage {CPI} Stacks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "55--58", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2761751", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "CPI stacks are an intuitive way to visualize processor core performance bottlenecks. However, they often do not provide a full view on all bottlenecks, because stall events can occur concurrently. Typically one of the events is selected, which means information about the non-chosen stall events is lost. Furthermore, we show that there is no single correct CPI stack: stall penalties can be hidden, can overlap or can cause second-order effects, making total CPI more complex than just a sum of components. Instead of showing a single CPI stack, we propose to measure multiple CPI stacks during program execution: a CPI stack at each stage of the processor pipeline. This representation reveals all performance bottlenecks and provides a more complete view on the performance of an application. Multi-stage CPI stacks are easy to collect, which means that they can be included in a simulator with negligible slowdown, and that they can be included in the core hardware with limited overhead.", acknowledgement = ack-nhfb, affiliation = "Eyerman, S (Reprint Author), Intel Corp, Santa Clara, CA 95054 USA. Eyerman, Stijn; Heirman, Wim; Du Bois, Kristof; Hur, Ibrahim, Intel Corp, Santa Clara, CA 95054 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "Stijn.Eyerman@intel.com Wim.Heirman@intel.com Kristof.Du.Bois@intel.com Ibrahim.Hur@intel.com", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "1/f noise; Additives; CPI stacks; Hardware; microprocessor chips; multiple CPI stacks; multistage CPI stacks; Performance analysis; performance counters; performance evaluation; Performance gain; pipeline processing; Pipelines; processor core performance bottlenecks; processor pipeline; program execution; Proposals; Radiation detectors; single correct CPI stack; stall events; stall penalties; total CPI", number-of-cited-references = "7", research-areas = "Computer Science", times-cited = "0", unique-id = "Eyerman:2018:MSC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhang:2018:LHC, author = "Guowei Zhang and Daniel Sanchez", title = "Leveraging Hardware Caches for Memoization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "59--63", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2762308", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memoization improves performance and saves energy by caching and reusing the outputs of repetitive computations. Prior work has proposed software and hardware memoization techniques, but both have significant drawbacks. Software memoization suffers from high runtime overheads, and is thus limited to long computations. Conventional hardware memoization techniques achieve low overheads and can memoize short functions, but they rely on large, special-purpose memoization caches that waste significant area and energy. We propose MCACHE, a hardware technique that leverages data caches for memoization. MCACHE stores memoization tables in memory, and allows them to share cache capacity with normal program data. 
MCACHE introduces ISA and pipeline extensions to accelerate memoization operations, bridging the gap between software and conventional hardware techniques. Simulation results show that MCACHE improves performance by up to 21x, outperforms software memoization by up to 2.2x, and achieves similar or superior performance over conventional hardware techniques without any dedicated storage.", acknowledgement = ack-nhfb, affiliation = "Sanchez, D (Reprint Author), MIT CSAIL, Cambridge, MA 02139 USA. Zhang, Guowei; Sanchez, Daniel, MIT CSAIL, Cambridge, MA 02139 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "zhanggw@csail.mit.edu sanchez@csail.mit.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "C-FAR, one of six SRC STAR-net centers by MARCO; C-FAR, one of six SRC STAR-net centers by DARPA; NSF [CAREER-1452994]", funding-text = "This work was supported in part by C-FAR, one of six SRC STAR-net centers by MARCO and DARPA, and by NSF grant CAREER-1452994.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; Benchmark testing; cache capacity; cache storage; caches; Computer architecture; data caches; energy by caching; Hardware; hardware caches; Indexes; MCACHE; memoization; memoization operations; memoization tables; memory systems; power aware computing; Registers; runtime overheads; Semantics; Software; software memoization suffers; special-purpose memoization caches", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "0", unique-id = "Zhang:2018:LHC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Vakil-Ghahani:2018:CRP, author = "Armin Vakil-Ghahani and Sara Mahdizadeh-Shahri and Mohammad-Reza Lotfi-Namin and Mohammad Bakhshalipour and Pejman Lotfi-Kamran and Hamid Sarbazi-Azad", title = "Cache Replacement Policy Based on Expected Hit Count", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "64--67", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2762660", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memory-intensive workloads operate on massive amounts of data that cannot be captured by last-level caches (LLCs) of modern processors. Consequently, processors encounter frequent off-chip misses, and hence, lose significant performance potential. One of the components of a modern processor that has a prominent influence on the off-chip miss traffic is LLC's replacement policy. Existing processors employ a variation of least recently used (LRU) policy to determine the victim for replacement. Unfortunately, there is a large gap between what LRU offers and that of Belady's MIN, which is the optimal replacement policy. 
Belady's MIN requires selecting a victim with the longest reuse distance, and hence, is unfeasible due to the need for knowing the future. In this work, we observe that there exists a strong correlation between the expected number of hits of a cache block and the reciprocal of its reuse distance. Taking advantage of this observation, we improve the efficiency of last-level caches through a low-cost-yet-effective replacement policy. We suggest a hit-count based victim-selection procedure on top of existing low-cost replacement policies to significantly improve the quality of victim selection in last-level caches without commensurate area overhead. Our proposal offers 12.2 percent performance improvement over the baseline LRU in a multi-core processor and outperforms EVA, which is the state-of-the-art replacement policy.", acknowledgement = ack-nhfb, affiliation = "Bakhshalipour, M (Reprint Author), Sharif Univ Technol, Dept Comp Engn, Tehran 1115511365, Iran. Vakil-Ghahani, Armin; Mahdizadeh-Shahri, Sara; Lotfi-Namin, Mohammad-Reza; Bakhshalipour, Mohammad; Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp Engn, Tehran 1115511365, Iran. Lotfi-Kamran, Pejman; Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch Comp Sci, Tehran 1953833511, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "vakil@ce.sharif.edu smahdizadeh@ce.sharif.edu mrlotfi@ce.sharif.edu bakhshalipour@ce.sharif.edu plotfi@ipm.ir azad@sharif.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Belady's MIN; cache block; cache replacement policy; cache storage; Correlation; expected hit count; History; hit-count based victim-selection procedure; last-level cache; last-level caches; longest reuse distance; low-cost replacement policies; low-cost-yet-effective replacement policy; Memory system; memory-intensive workload; memory-intensive workloads; Multicore processing; multicore processor; multiprocessing systems; off-chip miss traffic; off-chip misses; optimal replacement policy; performance evaluation; performance improvement; Prefetching; Proposals; Radiation detectors; replacement policy; victim selection", keywords-plus = "PREDICTION", number-of-cited-references = "16", ORCID-numbers = "Vakil Ghahani, Seyed Armin/0000-0002-4365-8932", research-areas = "Computer Science", times-cited = "2", unique-id = "Vakil-Ghahani:2018:CRP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hadjilambrou:2018:SCV, author = "Zacharias Hadjilambrou and Shidhartha Das and Marco A. Antoniades and Yiannakis Sazeides", title = "Sensing {CPU} Voltage Noise Through Electromagnetic Emanations", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "68--71", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2766221", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This work proposes sensing CPU voltage noise through wireless electromagnetic (EM) emanations from the CPU. Compared to previous voltage monitoring methodologies, this approach is not intrusive as it does not require direct physical access to the monitored CPU. 
To prove the effectiveness of this approach, we use EM signal feedback to find the resonant frequency of the CPU power delivery network, and to generate a CPU voltage noise (dI/dt) virus. This study is performed on a modern out-of-order CPU that supports on-chip fine grain voltage monitoring. This on-chip voltage monitoring capability is used to validate the proposed EM methodology.", acknowledgement = ack-nhfb, affiliation = "Hadjilambrou, Z (Reprint Author), Univ Cyprus, CY-1678 Nicosia, Cyprus. Hadjilambrou, Zacharias; Antoniades, Marco A.; Sazeides, Yiannakis, Univ Cyprus, CY-1678 Nicosia, Cyprus. Das, Shidhartha, ARM, Cambridge CB1 9NJ, England.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "zhadji01@cs.ucy.ac.cy Shidhartha.Das@arm.com mantonia@ucy.ac.cy yanos@cs.ucy.ac.cy", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "European Union Horizon 2020 project Uniserver [688540]; University of Cyprus", funding-text = "This work is partially supported by European Union Horizon 2020 project Uniserver grant no. 688540 and the University of Cyprus.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "circuit resonance; CPU power delivery network; CPU voltage noise virus; electromagnetic emanations; EM signal feedback; Frequency measurement; Genetic algorithms; Hardware reliability; microprocessor chips; Monitoring; on-chip fine grain voltage monitoring; on-chip voltage monitoring capability; Resonant frequency; RLC circuits; Stress; stress tests; System-on-chip; voltage noise; voltage regulators; wireless electromagnetic emanations", number-of-cited-references = "19", ORCID-numbers = "Antoniades, Marco/0000-0002-9699-2387", research-areas = "Computer Science", times-cited = "2", unique-id = "Hadjilambrou:2018:SCV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jung:2018:PCU, author = "Daejin Jung and Sunjung Lee and Wonjong Rhee and Jung Ho Ahn", title = "Partitioning Compute Units in {CNN} Acceleration for Statistical Memory Traffic Shaping", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "72--75", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2773055", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Convolutional Neural Networks (CNNs) have become the default choice for processing visual information, and the design complexity of CNNs has been steadily increasing to improve accuracy. To cope with the massive amount of computation needed for such complex CNNs, the latest solutions utilize blocking of an image over the available dimensions (e.g., horizontal, vertical, channel, and kernel) and batching of multiple input images to improve data reuse in the memory hierarchy. 
While there has been a large collection of works on maximizing data reuse, only a few studies have focused on the memory bottleneck problem caused by limited bandwidth. Bandwidth bottleneck can easily occur in CNN acceleration as CNN layers have different sizes with varying computation needs and as batching is typically performed over each layer of CNN for an ideal data reuse. In this case, the data transfer demand for a layer can be relatively low or high compared to the computation requirement of the layer, and therefore temporal fluctuations in memory access can be induced eventually causing bandwidth problems. In this paper, we first show that there exists a high degree of fluctuation in memory access to computation ratio depending on CNN layers and functions in the layer being processed by the compute units (cores), where the compute units are tightly synchronized to maximize data reuse. Then we propose a strategy of partitioning the compute units where the cores within each partition process a batch of input data in a synchronous manner to maximize data reuse but different partitions run asynchronously. Because the partitions stay asynchronous and typically process different CNN layers at any given moment, the memory access traffic sizes of the partitions become statistically shuffled. Thus, the partitioning of compute units and asynchronous use of them make the total memory access traffic size be smoothened over time, and the degree of partitioning determines a tradeoff between data reuse efficiency and memory bandwidth utilization efficiency. We call this smoothing statistical memory traffic shaping, and we show that it can lead to 8.0 percent of performance gain on a commercial 64-core processor when running ResNet-50.", acknowledgement = ack-nhfb, affiliation = "Rhee, W; Ahn, JH (Reprint Author), Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul 151742, South Korea. 
Jung, Daejin; Lee, Sunjung; Rhee, Wonjong; Ahn, Jung Ho, Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul 151742, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "haijd@snu.ac.kr shiish@snu.ac.kr wrhee@snu.ac.kr gajh@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea grant --- Korea government [NRF-2017R1A2B2005416, NRF-2017R1E1A1A03070560]", funding-text = "This work was partially supported by the National Research Foundation of Korea grant funded by the Korea government (NRF-2017R1A2B2005416 and NRF-2017R1E1A1A03070560).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Acceleration; Bandwidth; bandwidth bottleneck; bandwidth problems; CNN; CNN acceleration; CNN layers; complex CNNs; compute units; computation requirement; Computational modeling; Computer architecture; Convolution; convolutional neural networks; data transfer demand; horizontal channel; ideal data reuse; image processing; Kernel; maximize data reuse; memory access traffic sizes; memory bandwidth utilization efficiency; memory bottleneck; memory bottleneck problem; memory hierarchy; microprocessor chips; multiprocessing systems; neural nets; Neural networks; parallel processing; partitioning; partitioning compute units; smoothing statistical memory traffic shaping; traffic shaping; vertical channel", number-of-cited-references = "16", oa = "Bronze", ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394 Rhee, Wonjong/0000-0002-2590-8774", research-areas = "Computer Science", times-cited = "0", unique-id = "Jung:2018:PCU", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{SanMiguel:2018:EMA, author = "Joshua {San Miguel} and Karthik Ganesan and Mario Badr and Natalie {Enright Jerger}", title = "The {EH} Model:
Analytical Exploration of Energy-Harvesting Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "76--79", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2777834", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Energy-harvesting devices---which operate solely on energy collected from their environment---have brought forth a new paradigm of intermittent computing. These devices succumb to frequent power outages that would cause conventional systems to be stuck in a perpetual loop of restarting computation and never making progress. Ensuring forward progress in an intermittent execution model is difficult and requires saving state in non-volatile memory. In this work, we propose the EH model to explore the trade-offs associated with backing up data to maximize forward progress. In particular, we focus on the relationship between energy and forward progress and how they are impacted by backups/restores to derive insights for programmers and architects.", acknowledgement = ack-nhfb, affiliation = "San Miguel, J (Reprint Author), Univ Toronto, Edward S Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S, Canada. San Miguel, Joshua; Ganesan, Karthik; Badr, Mario; Jerger, Natalie Enright, Univ Toronto, Edward S Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S, Canada.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "joshua.sanmiguel@mail.utoronto.ca karthik.ganesan@mail.utoronto.ca mario.badr@mail.utoronto.ca enright@ece.utoronto.ca", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "analytical exploration; analytical model; Analytical models; Computational modeling; Computer architecture; conventional systems; EH model; energy harvesting; Energy-harvesting; energy-harvesting architectures; energy-harvesting devices; forward progress; frequent power outages; intermittent computing; intermittent execution model; Mathematical model; Nonvolatile memory; nonvolatile memory; perpetual loop; power aware computing; Power system reliability; random-access storage", number-of-cited-references = "11", ORCID-numbers = "Ganesan, Karthik/0000-0002-2541-1549", research-areas = "Computer Science", times-cited = "1", unique-id = "SanMiguel:2018:EMA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2018:SPM, author = "Jihun Kim and Joonsung Kim and Pyeongsu Park and Jong Kim and Jangwoo Kim", title = "{SSD} Performance Modeling Using Bottleneck Analysis", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "80--83", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2779122", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Solid-State Drives (SSDs) are widely deployed for high throughput and low latency. However, the unpredictable access latency of SSDs makes it difficult to satisfy quality-of-service requirements and fully achieve the performance potential. In fact, it has been a fundamental challenge to accurately predict the access latency of modern SSDs performing many non-disclosed, device-specific intra-SSD optimizations. In this paper, we propose SSDcheck, a novel SSD performance model which accurately predicts the latency of future SSD accesses. 
After first identifying write buffer (WB) and garbage collection (GC) as the key components in modeling the access latency, we develop diagnosis snippets to identify the target SSD's critical intra-SSD parameters (e.g., WB size). Finally, we construct the SSD's access-latency model with the identified parameters. Our system-level evaluations using five commodity SSDs show that SSDcheck achieves up to 93 percent prediction accuracy. Our real-world prototype applying an SSDcheck-aware system-level request scheduling can significantly improve both throughput and tail latency by up to 2.1x and 1.46x, respectively.", acknowledgement = ack-nhfb, affiliation = "Kim, J (Reprint Author), Seoul Natl Univ, Dept Elect \& Comp Engn, Seoul 151742, South Korea. Kim, Jihun; Kim, Jong, POSTECH, Dept Comp Sci \& Engn, Pohang 37673, Gyeongbuk, South Korea. Kim, Joonsung; Park, Pyeongsu; Kim, Jangwoo, Seoul Natl Univ, Dept Elect \& Comp Engn, Seoul 151742, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jihun735@postech.ac.kr jkim@postech.ac.kr pyeongsu@snu.ac.kr joonsung90@snu.ac.kr jangwoo@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Basic Science Research Program through the National Research Foundation of Korea (NRF) --- Ministry of Science, ICT \& Future Planning [NRF-2015M3C4A7065647, NRF-2017R1A2B3011038]; Institute for Information \& communications Technology Promotion (IITP) grant --- Korea government (MSIT) [R0190-15-2012]", funding-text = "This work was partly supported by Basic Science Research Program through the National Research Foundation of Korea (NRF) funded by the Ministry of Science, ICT \& Future Planning (NRF-2015M3C4A7065647, NRF-2017R1A2B3011038), and Institute for Information \& communications Technology Promotion (IITP) grant funded by the Korea government (MSIT) (No. R0190-15-2012). 
Jihun Kim and Joonsung Kim contributed equally to this work.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "bottleneck analysis; cache storage; commodity SSD; critical intra-SSD parameters; device-specific intra-SSD optimizations; Engines; Feature extraction; flash memories; future SSD accesses; garbage collection; identified parameters; Interference; Monitoring; Predictive models; quality-of-service requirements; Resource management; scheduling; solid-state drives; SSD access-latency model; SSDcheck-aware system-level request scheduling; SSD performance model; SSD performance modeling; storage management; Throughput; unpredictable access latency", number-of-cited-references = "10", ORCID-numbers = "Kim, Jihun/0000-0001-8893-8447", research-areas = "Computer Science", times-cited = "0", unique-id = "Kim:2018:SPM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Angstadt:2018:MOS, author = "Kevin Angstadt and Jack Wadden and Vinh Dang and Ted Xie and Dan Kramp and Westley Weimer and Mircea Stan and Kevin Skadron", title = "{MNCaRT}: an Open-Source, Multi-Architecture Automata-Processing Research and Execution Ecosystem", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "84--87", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2780105", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "We present MNCaRT, a comprehensive software ecosystem for the study and use of automata processing across hardware platforms. Tool support includes manipulation of automata, execution of complex machines, high-speed processing of NFAs and DFAs, and compilation of regular expressions. 
We provide engines to execute automata on CPUs (with VASim and Intel Hyperscan), GPUs (with custom DFA and NFA engines), and FPGAs (with an HDL translator). We also introduce MNRL, an open-source, general-purpose and extensible state machine representation language developed to support MNCaRT. The representation is flexible enough to support traditional finite automata (NFAs, DFAs) while also supporting more complex machines, such as those which propagate multi-bit signals between processing elements. We hope that our ecosystem and representation language stimulates new efforts to develop efficient and specialized automata processing applications.", acknowledgement = ack-nhfb, affiliation = "Angstadt, K (Reprint Author), Univ Michigan, Comp Sci \& Engn Div, Dept Elect Engn \& Comp Sci, Ann Arbor, MI 48109 USA. Angstadt, Kevin; Weimer, Westley, Univ Michigan, Comp Sci \& Engn Div, Dept Elect Engn \& Comp Sci, Ann Arbor, MI 48109 USA. Wadden, Jack; Dang, Vinh; Xie, Ted; Kramp, Dan; Stan, Mircea; Skadron, Kevin, Univ Virginia, Dept Comp Sci, Charlottesville, VA 22904 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "angstadt@umich.edu wadden@virginia.edu vqd8a@virginia.edu ted.xie@virginia.edu dankramp@virginia.edu weimerw@umich.edu mircea@virginia.edu skadron@virginia.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation [CCF-1116673, CCF-1629450, CCF-1619123, CNS-1619098]; AFRL [FA8750-15-2-0075]; Jefferson Scholars Foundation; Achievement Rewards for College Scientists (ARCS) Foundation; Xilinx; C-FAR, one of six centers of STARnet; Semiconductor Research Corporation program - MARCO; DARPA", funding-text = "This work was supported in part by grants from the US National Science Foundation (CCF-1116673, CCF-1629450, CCF-1619123, CNS-1619098), AFRL (FA8750-15-2-0075), Jefferson Scholars Foundation, Achievement Rewards for College Scientists (ARCS) Foundation, a grant from Xilinx, and support from C-FAR, one of six centers of STARnet, a Semiconductor Research Corporation program sponsored by MARCO and DARPA. Any opinions, findings and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of AFRL.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator architectures; Automata; Benchmark testing; complex machines; comprehensive software ecosystem; DFA; Ecosystems; efficient automata processing applications; Engines; extensible state machine representation language; Field programmable gate arrays; field programmable gate arrays; finite automata; finite state machines; formal languages; hardware platforms; high-speed processing; Intel Hyperscan; MNCaRT; NFA engines; open source software; Open source software; open source software; open-source-multiarchitecture automata-processing research; software tools; specialized automata processing applications; Tools; traditional finite automata", number-of-cited-references = "21", ORCID-numbers = "Angstadt, Kevin/0000-0002-0104-5257", research-areas = "Computer Science", researcherid-numbers = "Stan, Mircea/L-6219-2019", times-cited = "2", unique-id = "Angstadt:2018:MOS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zheng:2018:EPE, author = "Hao Zheng and Ahmed Louri", title = "{EZ-Pass}: an Energy \& Performance-Efficient Power-Gating Router Architecture for Scalable {NoCs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "88--91", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2783918", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "With technology scaling into nanometer regime, static power is becoming the dominant factor in the overall power consumption of Network-on-Chips (NoCs). Static power can be reduced by powering off routers during consecutive idle time through power-gating techniques. 
However, power-gating techniques suffer from a large wake-up latency to wake up the powered-off routers. Recent research aims to improve the wake-up latency penalty by hiding it through early wake-up techniques. However, these techniques do not exploit the full advantage of power-gating due to the early wake-up. Consequently, they do not achieve significant power savings. In this paper, we propose an architecture called Easy Pass (EZ-Pass) router that remedies the large wake-up latency overheads while providing significant static power savings. The proposed architecture takes advantage of idle resources in the network interface to transmit packets without waking up the router. Additionally, the technique hides the wake-up latency by continuing to provide packet transmission during the wake-up phase. We use full system simulation to evaluate our EZ-Pass router on a 64-core NoC with a mesh topology using PARSEC benchmark suites. Our results show that the proposed router reduces static power by up to 31 percent and overall network latency by up to 32 percent as compared to early-wakeup optimized power-gating techniques.", acknowledgement = ack-nhfb, affiliation = "Zheng, H (Reprint Author), George Washington Univ, Dept Elect \& Comp Engn, Washington, DC 20052 USA. Zheng, Hao; Louri, Ahmed, George Washington Univ, Dept Elect \& Comp Engn, Washington, DC 20052 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "haozheng@gwu.edu louri@gwu.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; early-wakeup optimized power-gating techniques; easy pass router; energy conservation; energy-efficient; energy-efficient power-gating router architecture; EZ-Pass router; Latches; mesh topology; network interface; network routing; network-on-chip; network-on-chips; network-on-chips; Nickel; NoC; PARSEC benchmark suites; performance-efficient power-gating router architecture; Ports (Computers); power consumption; Power-gating; Routing; Routing protocols; scalable NoCs; static power savings; Switches; wake-up latency overheads; wake-up latency penalty; wake-up phase", keywords-plus = "ON-CHIP", number-of-cited-references = "16", research-areas = "Computer Science", times-cited = "0", unique-id = "Zheng:2018:EPE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Delshadtehrani:2018:NPM, author = "Leila Delshadtehrani and Schuyler Eldridge and Sadullah Canakci and Manuel Egele and Ajay Joshi", title = "{Nile}: a Programmable Monitoring Coprocessor", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "92--95", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2784416", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Researchers widely employ hardware performance counters (HPCs) as well as debugging and profiling tools in processors for monitoring different events such as cache hits, cache misses, and branch prediction statistics during the execution of programs. The collected information can be used for power, performance, and thermal management of the system as well as detecting anomalies or malicious behavior in the software. 
However, monitoring new or complex events using HPCs and existing tools is a challenging task because HPCs only provide a fixed pool of raw events to monitor. To address this challenge, we propose the implementation of a programmable hardware monitor in a complete system framework including the hardware monitor architecture and its interface with an in-order single-issue RISC-V processor as well as an operating system. As a proof of concept, we demonstrate how to programmatically implement a shadow stack using our hardware monitor and how the programmed shadow stack detects stack buffer overflow attacks. Our hardware monitor design incurs a 26 percent power overhead and a 15 percent area overhead over an unmodified RISC-V processor. Our programmed shadow stack has less than 3 percent performance overhead in the worst case.", acknowledgement = ack-nhfb, affiliation = "Delshadtehrani, L (Reprint Author), Boston Univ, Dept Elect \& Comp Engn, Boston, MA 02215 USA. Delshadtehrani, Leila; Eldridge, Schuyler; Canakci, Sadullah; Egele, Manuel; Joshi, Ajay, Boston Univ, Dept Elect \& Comp Engn, Boston, MA 02215 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "delshad@bu.edu schuye@bu.edu scanakci@bu.edu megele@bu.edu joshi@bu.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-1533663]", funding-text = "We thank Prof. Jonathan Appavoo for providing invaluable help in designing the OS support and the software interface for Nile. This work was supported in part by NSF grant CCF-1533663.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "branch prediction statistics; cache hits; cache misses; cache storage; complete system framework; complex events; coprocessors; Coprocessors; debugging; fixed pool; Hardware; Hardware coprocessor; hardware monitor architecture; hardware monitor design; hardware performance counters; HPCs; Linux; malicious behavior; Monitoring; Nile; operating system; operating systems (computers); Pattern matching; performance evaluation; performance overhead; power overhead; profiling tools; Program processors; programmable hardware; programmable hardware monitor; programmable monitoring coprocessor; programmed shadow stack; raw events; reduced instruction set computing; Rockets; security; shadow stack; single-issue RISC-V processor; stack buffer overflow attack; stack buffer overflow attacks; thermal management; unmodified RISC-V processor", number-of-cited-references = "17", ORCID-numbers = "Joshi, Ajay/0000-0002-3256-9942", research-areas = "Computer Science", times-cited = "0", unique-id = "Delshadtehrani:2018:NPM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lee:2018:TTW, author = "Eojin Lee and Sukhan Lee and G. Edward Suh and Jung Ho Ahn", title = "{TWiCe}: Time Window Counter Based Row Refresh to Prevent Row-Hammering", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "96--99", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2787674", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Computer systems using DRAM are exposed to row-hammering attacks, which can flip data in a DRAM row without directly accessing a row but by frequently activating its adjacent ones. 
There have been a number of proposals to prevent row-hammering, but they either incur large area/performance overhead or provide probabilistic protection. In this paper, we propose a new row-hammering mitigation mechanism named Time Window Counter based row refresh (TWiCe) which prevents row-hammering by using a small number of counters without performance overhead. We first make a key observation that the number of rows that can cause flipping their adjacent ones (aggressor candidates) is limited by the maximum values of row activation frequency and DRAM cell retention time. TWiCe exploits this limit to reduce the required number of counter entries by counting only actually activated DRAM rows and periodically invalidating the entries that are not activated frequently enough to be an aggressor. We calculate the maximum number of required counter entries per DRAM bank, with which row-hammering prevention is guaranteed. We further improve energy efficiency by adopting a pseudo-associative cache design to TWiCe. Our analysis shows that TWiCe incurs no performance overhead on normal DRAM operations and less than 0.7 percent area and energy overheads over contemporary DRAM devices.", acknowledgement = ack-nhfb, affiliation = "Lee, E; Ahn, JH (Reprint Author), Seoul Natl Univ, Seoul 151742, South Korea. Lee, Eojin; Lee, Sukhan; Ahn, Jung Ho, Seoul Natl Univ, Seoul 151742, South Korea. Suh, G. Edward, Cornell Univ, Ithaca, NY 14850 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "yohoyo@snu.ac.kr infy1026@snu.ac.kr suh@csl.cornell.edu gajh@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NRF of Korea [NRF-2017R1A2B2005416]; R\&D program of MOTIE/KEIT [10077609]; IDEC (EDA tool)", funding-text = "This work was partially supported by the NRF of Korea grant (NRF-2017R1A2B2005416), by the R\&D program of MOTIE/KEIT (10077609), and by IDEC (EDA tool).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; Computer architecture; DRAM; DRAM cell retention time; DRAM chips; DRAM row; energy efficiency; Microprocessors; Monitoring; performance overhead; Probabilistic logic; pseudoassociative cache design; Random access memory; refresh; reliability; row activation frequency; row-hammering; row-hammering attacks; row-hammering mitigation mechanism; row-hammering prevention; time window counter based row refresh; Time-frequency analysis; TWiCe", keywords-plus = "MEMORY", number-of-cited-references = "15", ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394 Suh, Edward/0000-0001-6409-9888", research-areas = "Computer Science", times-cited = "1", unique-id = "Lee:2018:TTW", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Rakshit:2018:LLO, author = "Joydeep Rakshit and Kartik Mohanram", title = "{LEO}: Low Overhead Encryption {ORAM} for Non-Volatile Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "100--104", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2795621", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; 
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Data confidentiality attacks utilizing memory access patterns threaten exposure of data in modern main memories. Oblivious RAM (ORAM) is an effective cryptographic primitive developed to thwart access-pattern-based attacks in DRAM-based systems. However, in emerging non-volatile memory (NVM) systems, the increased writes due to encryption of multiple data blocks on every Path ORAM (state-of-the-art efficient ORAM) access impose significant energy, lifetime, and performance overheads. LEO (Low overhead Encryption ORAM) is an efficient Path ORAM encryption architecture that addresses the high write overheads of ORAM integration in NVMs, while providing security equivalent to the baseline Path ORAM. LEO reduces NVM cell writes by securely decreasing the number of block encryptions during the write phase of a Path ORAM access. LEO uses a secure, two-level counter mode encryption framework that opportunistically eliminates re-encryption of unmodified blocks, reducing NVM writes. Our evaluations show that on average, LEO decreases NVM energy by 60 percent, improves lifetime by 1.51 x, and increases performance by 9 percent over the baseline Path ORAM.", acknowledgement = ack-nhfb, affiliation = "Rakshit, J (Reprint Author), Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Rakshit, Joydeep; Mohanram, Kartik, Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "joydeep.rakshit@pitt.edu kmram@pitt.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "baseline path ORAM; block encryptions; Computer architecture; cryptography; data confidentiality attacks; DRAM chips; efficient path ORAM encryption architecture; emerging nonvolatile memory systems; Encryption; LEO; low-overhead encryption ORAM; memory access patterns; memory security; multiple data blocks; non-volatile memory; nonvolatile memories; Nonvolatile memory; NVM; Oblivious RAM; ORAM integration; path ORAM access; Random access memory; random-access storage; System-on-chip; two-level counter mode encryption framework", number-of-cited-references = "21", ORCID-numbers = "Rakshit, Joydeep/0000-0002-3670-4814", research-areas = "Computer Science", times-cited = "0", unique-id = "Rakshit:2018:LLO", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Do:2018:CRL, author = "Sang Wook Stephen Do and Michel Dubois", title = "Core Reliability: Leveraging Hardware Transactional Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "105--108", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2791433", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Modern microprocessors are more vulnerable to transient faults or soft errors than ever before due to design trends mandating low supply voltage and reduced noise margins, shrinking feature sizes and increased transistor density for fast, low-power circuits. As industry now supports Hardware Transactional Memory (HTM), the features of HTM can be leveraged to add core resiliency to transient errors. 
In this paper, we propose a novel microarchitecture for transient error detection and recovery based on time redundancy and backward error recovery leveraging HTM's existing features especially its rollback mechanism. We provide implementation details for single-core reliability, minimizing additions to existing HTM supports. We evaluate the performance overheads of the single core with the reliability feature by comparing it to the base machine without the reliability feature. Finally we show how single-core reliability can be extended to multi-core reliability.", acknowledgement = ack-nhfb, affiliation = "Do, SWS (Reprint Author), Univ Southern Calif, Dept Elect Engn, EEB200, Elect Engn Bldg, Los Angeles, CA 90089 USA. Do, Sang Wook Stephen, Univ Southern Calif, Dept Elect Engn, EEB200, Elect Engn Bldg, Los Angeles, CA 90089 USA. Dubois, Michel, Univ Southern Calif, Dept Elect Engn, EEB228, Elect Engn Bldg, Los Angeles, CA 90089 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sdo@usc.edu dubois@usc.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CCF-0954211]", funding-text = "The authors wish to thank Daniel Wong at UC Riverside for advice on setting up the SPEC 2006 benchmark suite. This material is based upon work supported by the National Science Foundation under Grant CCF-0954211.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "backward error recovery; computer system organization; core resiliency; design trends; Electrical engineering; error detection; feature sizes; Fingerprint recognition; Hardware; hardware transactional memory; Hardware Transactional Memory; hardware transactional memory; HTM; integrated circuit design; integrated circuit reliability; low supply voltage; low-power circuits; low-power electronics; memory architecture; microprocessor chips; modern microprocessors; Multicore processing; multicore reliability; noise margins; performance and reliability; Registers; Reliability; rollback mechanism; single-core reliability; soft errors; time redundancy; Transient analysis; transient error detection; transient error recovery; transient faults; transistor density", keywords-plus = "TRANSIENT-FAULT RECOVERY; MULTIPROCESSORS; CONSISTENCY; SUPPORT", number-of-cited-references = "30", research-areas = "Computer Science", times-cited = "0", unique-id = "Do:2018:CRL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kaliorakis:2018:SAM, author = "Manolis Kaliorakis and Athanasios Chatzidimitriou and George Papadimitriou and Dimitris Gizopoulos", title = "Statistical Analysis of Multicore {CPUs} Operation in Scaled Voltage Conditions", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "109--112", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2798604", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Designers try to reduce the voltage margins of CPU chips to gain energy without sacrificing reliable operation. 
Statistical analysis methods are appealing to predict the safe operational margins at the system level as they do not induce area overheads and they can be applied during manufacturing or after the chips' release to the market. In this study, we present a comprehensive statistical analysis of the behavior of ARMv8 64-bit cores that are part of the enterprise 8-core X-Gene 2 micro-server family when they operate in scaled voltage conditions. Our prediction schemes that use real hardware counters as input are based on linear regression models with several feature selection techniques that aim to predict the safe voltage margins of any given workload when the cores operate in scaled conditions. Our findings show that our model is able to accurately predict safe voltage margins that provide up to 20.28\% power savings.", acknowledgement = ack-nhfb, affiliation = "Kaliorakis, M (Reprint Author), Univ Athens, Comp Architecture Lab, Athens, Greece. Kaliorakis, Manolis; Chatzidimitriou, Athanasios; Papadimitriou, George; Gizopoulos, Dimitris, Univ Athens, Comp Architecture Lab, Athens, Greece.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "manoliskal@di.uoa.gr achatz@di.uoa.gr georgepap@di.uoa.gr dgizop@di.uoa.gr", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "H2020 Programme of the European Union through the UniServer Project [688540]", funding-text = "This work is funded by the H2020 Programme of the European Union through the UniServer Project (Grant Agreement 688540).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "area overheads; ARMv8 cores; comprehensive statistical analysis; Computational modeling; Computer crashes; CPU chips; design margins; Energy-efficient computing; enterprise 8-core X-Gene 2 microserver family; feature selection; feature selection techniques; Hardware; hardware counters; hardware reliability; Linear regression; linear regression models; microprocessor chips; multicore CPUs operation; multiprocessing systems; power aware computing; power savings; prediction schemes; Predictive models; regression analysis; safe operational margins; safe voltage margins; scaled voltage conditions; statistical methods; system level; voltage margins; Voltage measurement; word length 64 bit", keywords-plus = "NOISE", number-of-cited-references = "10", ORCID-numbers = "Gizopoulos, Dimitris/0000-0002-1613-9061 Chatzidimitriou, Athanasios/0000-0001-8161-7165", research-areas = "Computer Science", researcherid-numbers = "Gizopoulos, Dimitris/U-2731-2018", times-cited = "2", unique-id = "Kaliorakis:2018:SAM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Khoram:2018:AAA, author = "Soroosh Khoram and Yue Zha and Jing Li", title = "An Alternative Analytical Approach to Associative Processing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "113--116", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2789424", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Associative Processing (AP) is a promising alternative to the Von Neumann model as it addresses the memory wall problem through its inherent in-memory computations. 
However, because of the countless design parameter choices, comparisons between implementations of two so radically different models are challenging for simulation-based methods. To tackle these challenges, we develop an alternative analytical approach based on a new concept called architecturally-determined complexity. Using this method, we asymptotically evaluate the runtime/storage/energy bounds of the two models, i.e., AP and Von Neumann. We further apply the method to gain more insights into the performance bottlenecks of traditional AP and develop a new machine model named Two Dimensional AP to address these limitations. Finally, we experimentally validate our analytical method and confirm that the simulation results match our theoretical projections.", acknowledgement = ack-nhfb, affiliation = "Khoram, S (Reprint Author), Univ Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA. Khoram, Soroosh; Zha, Yue; Li, Jing, Univ Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "khoram@wisc.edu yzha.3@wisc.edu jli@ece.wisc.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "alternative analytical approach; analysis of algorithms and problem complexity; analytical method; Analytical models; architecturally-determined complexity; associative processing; Associative Processing; Associative processors; Complexity theory; Computational modeling; Computer architecture; content-addressable storage; countless design parameter choices; in-memory computations; machine model; memory wall problem; modeling techniques; models of computation; Parallel processing; Runtime; runtime-storage-energy bounds; simulation-based methods; traditional AP; two dimensional AP; Two dimensional displays; Von Neumann model", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "0", unique-id = "Khoram:2018:AAA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Khatamifard:2018:MSD, author = "S. Karen Khatamifard and M. Hassan Najafi and Ali Ghoreyshi and Ulya R. Karpuzcu and David J. Lilja", title = "On Memory System Design for Stochastic Computing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "117--121", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2804926", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Growing uncertainty in design parameters (and therefore, in design functionality) renders stochastic computing particularly promising, which represents and processes data as quantized probabilities. However, due to the difference in data representation, integrating conventional memory (designed and optimized for non-stochastic computing) in stochastic computing systems inevitably incurs a significant data conversion overhead. 
Barely any stochastic computing proposal to-date covers the memory impact. In this paper, as the first study of its kind to the best of our knowledge, we rethink the memory system design for stochastic computing. The result is a seamless stochastic system, StochMem, which features analog memory to trade the energy and area overhead of data conversion for computation accuracy. In this manner StochMem can reduce the energy (area) overhead by up-to 52.8\% (93.7\%) at the cost of at most 0.7\% loss in computation accuracy.", acknowledgement = ack-nhfb, affiliation = "Khatamifard, SK (Reprint Author), Univ Minnesota, Minneapolis, MN 55455 USA. Khatamifard, S. Karen; Najafi, M. Hassan; Ghoreyshi, Ali; Karpuzcu, Ulya R.; Lilja, David J., Univ Minnesota, Minneapolis, MN 55455 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "khatami@umn.edu najaf011@umn.edu ghore002@umn.edu ukarpuzc@umn.edu lilja@umn.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation [CCF-1408123, XPS-CCA-1438286]", funding-text = "This work was supported in part by US National Science Foundation grant no. CCF-1408123 and XPS-CCA-1438286. Any opinions, findings and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the National Science Foundation.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "analog memory; Analog memory; analog memory; computation accuracy; conventional memory; Data conversion; data representation; design functionality; design parameters; energy-efficient design; Image processing; Image sensors; memory architecture; memory impact; memory system design; near-sensor processing; probability; seamless stochastic system; Sensors; significant data conversion overhead; Stochastic computing; stochastic computing proposal to-date; stochastic computing systems; stochastic processes; Stochastic systems; System analysis and design", keywords-plus = "COMPUTATION", number-of-cited-references = "16", ORCID-numbers = "Najafi, M. Hassan/0000-0002-4655-6229 Lilja, David/0000-0003-3785-8206", research-areas = "Computer Science", researcherid-numbers = "Najafi, M. Hassan/I-2952-2019", times-cited = "1", unique-id = "Khatamifard:2018:MSD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Mouris:2018:TSB, author = "Dimitris Mouris and Nektarios Georgios Tsoutsos and Michail Maniatakos", title = "{TERMinator} Suite: Benchmarking Privacy-Preserving Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "122--125", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2812814", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Security and privacy are fundamental objectives characterizing contemporary cloud computing. Despite the wide adoption of encryption for protecting data in transit and at rest, data in use remains unencrypted inside cloud processors and memories, as computation is not applicable on encrypted values. 
This limitation introduces security risks, as unencrypted values can be leaked through side-channels or hardware Trojans. To address this problem, encrypted architectures have recently been proposed, which leverage homomorphic encryption to natively process encrypted data using datapaths of thousands of bits. In this case, additional security protections are traded for higher performance penalties, which drives the need for more efficient architectures. In this work, we develop benchmarks specifically tailored to homomorphic computers, to enable comparisons across different architectures. Our benchmark suite, dubbed TERMinator, is unique as it avoids ``termination problems'' that prohibit making control-flow decisions and evaluating early termination conditions based on encrypted data, as these can leak information. Contrary to generic suites that ignore the fundamental challenges of encrypted computation, our algorithms are tailored to the security primitives of the target encrypted architecture, such as the existence of branching oracles. In our experiments, we compiled our benchmarks for the Cryptoleq architecture and evaluated their performance for a range of security parameters.", acknowledgement = ack-nhfb, affiliation = "Tsoutsos, NG (Reprint Author), NYU, New York, NY 10003 USA. Mouris, Dimitris, Univ Athens, GR-10679 Athens, Greece. Tsoutsos, Nektarios Georgios; Maniatakos, Michail, NYU, New York, NY 10003 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jimouris@di.uoa.gr nektarios.tsoutsos@nyu.edu michail.maniatakos@nyu.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NYU Abu Dhabi Global Ph.D. Student Fellowship program", funding-text = "This work was partially sponsored by the NYU Abu Dhabi Global Ph.D. Student Fellowship program. D. Mouris thanks Orestis Polychroniou for the fruitful discussions.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Benchmarks; Cloud computing; cloud computing; cloud processors; Computer architecture; control-flow decisions; cryptography; Cryptoleq architecture; data privacy; dubbed TERMinator; encrypted architectures; encrypted computation; encrypted data; encrypted values; Encryption; hardware Trojans; higher performance penalties; homomorphic computers; homomorphic encryption; leakage prevention; performance evaluation; privacy-preserving architecture benchmarking; Program processors; security parameters; security protections; security risks; target encrypted architecture; termination problem; TERMinator suite; unencrypted values", number-of-cited-references = "14", ORCID-numbers = "Maniatakos, Michail/0000-0001-6899-0651", research-areas = "Computer Science", times-cited = "0", unique-id = "Mouris:2018:TSB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Choukse:2018:CEM, author = "Esha Choukse and Mattan Erez and Alaa Alameldeen", title = "{CompressPoints}: an Evaluation Methodology for Compressed Memory Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "126--129", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2821163", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Current memory technology has hit a wall trying to scale to meet the increasing demands of modern client and datacenter systems. Data compression is a promising solution to this problem. Several compressed memory systems have been proposed in the past years [1], [2], [3], [4]. Unfortunately, a reasonable methodology to evaluate these systems is missing. 
In this paper, we identify the challenges for evaluating main memory compression. We propose an effective methodology to evaluate a compressed memory system by proposing mechanisms to: (i) incorporate correct virtual address translation, (ii) choose a region in the application that is representative of the compression ratio, in addition to regular metrics like IPC and cache hit rates, and (iii) choose a representative region for multi-core workloads, bringing down the correlation error from 12.8 to 3.8 percent.", acknowledgement = ack-nhfb, affiliation = "Choukse, E (Reprint Author), Univ Texas Austin, Austin, TX 78712 USA. Choukse, Esha; Erez, Mattan, Univ Texas Austin, Austin, TX 78712 USA. Alameldeen, Alaa, Intel Labs, Santa Clara, CA 95054 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "esha.choukse@utexas.edu mattan.erez@utexas.edu alaa.r.alameldeen@intel.com", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; cache storage; compressed memory; compressed memory system; Compression; compression ratio; Computational modeling; computer centres; Correlation; current memory technology; data compression; datacenter systems; DRAM; evaluation; evaluation methodology; Hardware; Linux; main memory compression; Measurement; memory; memory architecture; Memory management; methodology; modern client; multi-core; multicore workloads; representative regions; storage management; translation; workloads", number-of-cited-references = "8", ORCID-numbers = "Choukse, Esha/0000-0003-0371-5522", research-areas = "Computer Science", times-cited = "0", unique-id = "Choukse:2018:CEM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kim:2018:ZRV, author = "Seikwon Kim and Wonsang Kwak and Changdae Kim and Jaehyuk Huh", title = "{Zebra} Refresh: Value Transformation for Zero-Aware {DRAM} Refresh Reduction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "130--133", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2822808", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Refresh operations consume growing portions of DRAM power with increasing DRAM capacity. To reduce the power consumption of such refresh operations, this paper proposes a novel value-aware refresh reduction technique exploiting the abundance of zero values in the memory contents. The proposed Zebra refresh architecture transforms the value and mapping of DRAM data to increase consecutive zero values, and skips a refresh operation for a row containing zero values entirely. 
Zebra converts memory blocks to base and delta values, inspired by a prior compression technique. Once values are converted, bits are transposed to place consecutive zeros matching the refresh granularity. The experimental results show Zebra refresh can reduce DRAM refresh operations by 43 percent on average for a set of benchmark applications.", acknowledgement = ack-nhfb, affiliation = "Huh, J (Reprint Author), Korea Adv Inst Sci \& Technol, Sch Comp, Daejeon 34141, South Korea. Kim, Seikwon; Kwak, Wonsang; Kim, Changdae; Huh, Jaehyuk, Korea Adv Inst Sci \& Technol, Sch Comp, Daejeon 34141, South Korea. Kim, Seikwon, Samsung Elect Co Ltd, Samsung Res, Suwon 443803, Gyeonggi Do, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "seikwon@calab.kaist.ac.kr wskwak@calab.kaist.ac.kr cdkim@calab.kaist.ac.kr jhuh@calab.kaist.ac.kr", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea [NRF-2016R1A2B4013352]; Institute for Information \& communications Technology Promotion [IITP-2017-000466]; Ministry of Science and ICT, Korea", funding-text = "This work is supported by the National Research Foundation of Korea (NRF-2016R1A2B4013352) and by the Institute for Information \& communications Technology Promotion (IITP-2017-000466). Both grants are funded by the Ministry of Science and ICT, Korea.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; data compression; data content conversion; data reduction; DRAM chips; DRAM data; DRAM energy; DRAM power; DRAM refresh; DRAM refresh operations; memory contents; Memory management; Microprocessors; power aware computing; power consumption; Power demand; Random access memory; refresh granularity; Transforms; value transformation; value-aware refresh reduction; Zebra refresh architecture; zero values; zero-aware DRAM refresh reduction", keywords-plus = "ENERGY", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Kim:2018:ZRV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kwon:2018:CMC, author = "Youngeun Kwon and Minsoo Rhu", title = "A Case for Memory-Centric {HPC} System Architecture for Training Deep Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "134--138", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2823302", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As the models and the datasets to train deep learning (DL) models scale, system architects are faced with new challenges, one of which is the memory capacity bottleneck, where the limited physical memory inside the accelerator device constrains the algorithm that can be studied. We propose a memory-centric deep learning system that can transparently expand the memory capacity accessible to the accelerators while also providing fast inter-device communication for parallel training. 
Our proposal aggregates a pool of memory modules locally within the device-side interconnect, which are decoupled from the host interface and function as a vehicle for transparent memory capacity expansion. Compared to conventional systems, our proposal achieves an average 2.1 x speedup on eight DL applications and increases the system-wide memory capacity to tens of TBs.", acknowledgement = ack-nhfb, affiliation = "Rhu, M (Reprint Author), Pohang Univ Sci \& Technol, Pohang 790784, Gyeongsangbuk D, South Korea. Kwon, Youngeun; Rhu, Minsoo, Pohang Univ Sci \& Technol, Pohang 790784, Gyeongsangbuk D, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "kyeg9404@gmail.com minsoo.rhu@gmail.com", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Samsung Research Funding Center of Samsung Electronics [SRFC-TB1703-03]", funding-text = "This work was supported by Samsung Research Funding Center of Samsung Electronics under Project Number SRFC-TB1703-03.", journal-iso = "IEEE Comput. Archit.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Computer architecture; conventional systems; deep learning; deep learning models scale; device-side interconnect; fast inter-device communication; Graphics processing units; hardware acceleration; learning (artificial intelligence); Machine learning; memory architecture; memory capacity bottleneck; memory modules; memory-centric deep learning system; memory-centric HPC system architecture; neural nets; neural network; parallel processing; parallel training; Performance evaluation; shared memory systems; storage management; system architects; system architecture; system-wide memory capacity; Systems architecture; Training; training deep neural networks; transparent memory capacity expansion; Virtualization", keywords-plus = "DESIGN", number-of-cited-references = "18", research-areas = "Computer Science", researcherid-numbers = "Rhu, Minsoo/O-6167-2018", times-cited = "0", unique-id = "Kwon:2018:CMC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Ipek:2018:BLL, author = "Engin Ipek and Florian Longnos and Shihai Xiao and Wei Yang", title = "Bit-Level Load Balancing: a New Technique for Improving the Write Throughput of Deeply Scaled {STT-MRAM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "139--142", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2819979", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Emerging non-volatile memories (NVMs) have drawn significant attention as potential DRAM replacements. STT-MRAM is one of the most promising NVMs due to its relatively low write energy, high speed, and high endurance. However, STT-MRAM suffers from its own scaling problems. 
As the size of the access transistor is decreased to reduce the cell area, the magnitude of the switching current that is supplied to the storage element decreases. The reduced switching current significantly lengthens the switching time, which makes write throughput a significant performance bottleneck for a memory system constructed from dense STT-MRAM cells. We introduce bit-level load balancing, a new technique that mitigates the performance overhead of limited write throughput in high-density, STT-MRAM based main memories. Bit-level load balancing takes advantage of the observation that many of the bits within a row of STT-MRAM remain unchanged when performing a write. The key idea is to architect the memory system such that different columns of different rows can be simultaneously written to an STT-MRAM subarray. By interleaving in time the bit updates from multiple writes, bit level load balancing improves average system performance by 19 percent, and comes within 6 percent of the performance of a DRAM based system.", acknowledgement = ack-nhfb, affiliation = "Ipek, E (Reprint Author), Univ Rochester, Dept Comp Sci, CSB Room 422, Rochester, NY 14627 USA. Ipek, E (Reprint Author), Univ Rochester, Dept Elect \& Comp Engn, CSB Room 422, Rochester, NY 14627 USA. Ipek, Engin, Univ Rochester, Dept Comp Sci, CSB Room 422, Rochester, NY 14627 USA. Ipek, Engin, Univ Rochester, Dept Elect \& Comp Engn, CSB Room 422, Rochester, NY 14627 USA. Longnos, Florian; Xiao, Shihai; Yang, Wei, Huawei Technol Co Ltd, Shenzhen 115371, Guangdong, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ipek@cs.rochester.edu florian.longnos@huawei.com xiaoshihai@huawei.com william.yangwei@huawei.com", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "bit level load balancing; bit-level load balancing; Computer architecture; deeply scaled STT-MRAM; dense STT-MRAM cells; DRAM chips; Load management; memory system; memory systems; Microprocessors; MRAM devices; non-volatile memories; nonvolatile memories; NVMs; performance bottleneck; Random access memory; resource allocation; STT-MRAM; STT-MRAM based main memories; STT-MRAM subarray; Switches; Throughput; Transistors; write throughput", keywords-plus = "PERFORMANCE; DESIGN; ENERGY", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "0", unique-id = "Ipek:2018:BLL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Iliakis:2018:DMS, author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios Soudris", title = "Decoupled {MapReduce} for Shared-Memory Multi-Core Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "143--146", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2827929", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Modern multi-core processors exhibit high integration densities, e.g., up to several tens of cores. Multiple programming frameworks have emerged to facilitate the development of highly parallel applications. The MapReduce programming model, after having demonstrated its usability in the area of distributed computing systems, has been adapted to the needs of shared-memory multi-processors showing promising results in comparison with conventional multi-threaded libraries, e.g., pthreads. 
In this paper we enhance the traditional MapReduce architecture by decoupling the map and combine phases in order to boost parallel execution. We show that combiners' memory intensive features limit the system's degree of parallelism, thus resulting in sub-optimal hardware utilization, leaving space for further performance improvements. The proposed decoupled MapReduce architecture is evaluated into a NUMA server platform, showing that the adoption of the De-MapR runtime enables more efficient hardware utilization and competent run-time improvements. We demonstrate that the proposed solution achieves execution speedups of up to 2.46x compared to a state-of-the-art, shared-memory MapReduce library.", acknowledgement = ack-nhfb, affiliation = "Iliakis, K (Reprint Author), Natl Tech Univ Athens, Zografos 15780, Greece. Iliakis, Konstantinos; Xydis, Sotirios; Soudris, Dimitrios, Natl Tech Univ Athens, Zografos 15780, Greece.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "konstantinos.iliakis@cern.ch sxydis@microlab.ntua.gr dsoudris@microlab.ntua.gr", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Containers; decoupled MapReduce architecture; distributed computing systems; hardware utilization; highly parallel applications; Instruction sets; Libraries; MapReduce; MapReduce programming model; modern multicore processors; multi-cores; multiple programming frameworks; parallel architectures; parallel execution; Parallel processing; parallel programming; Runtime; runtime systems; shared memory systems; shared-memory MapReduce library; shared-memory multicore architectures; shared-memory multiprocessors; sub-optimal hardware utilization; Task analysis", number-of-cited-references = "13", ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847", research-areas = "Computer Science", researcherid-numbers = "Soudris, Dimitrios/O-8843-2019", times-cited = "0", unique-id = "Iliakis:2018:DMS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Li:2018:BSB, author = "Zhaoshi Li and Leibo Liu and Yangdong Deng and Shouyi Yin and Shaojun Wei", title = "Breaking the Synchronization Bottleneck with Reconfigurable Transactional Execution", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "147--150", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2828402", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The advent of FPGA-based hybrid architecture offers the opportunity of customizing memory subsystems to enhance the overall system performance. However, it is not straightforward to design efficient FPGA circuits for emerging FPGAs applications such as in-memory database and graph analytics, which heavily depend on concurrent data structures (CDS'). 
Highly dynamic behaviors of CDS' have to be orchestrated by synchronization primitives for correct execution. These primitives induce overwhelming memory traffic for synchronizations on FPGAs. This paper proposes a novel method for systematically exploring and exploiting memory-level parallelism (MLP) of CDS by transactional execution on FPGAs. Inspired by the idea that semantics of transactions can be implemented in a more efficient and scalable manner on FPGAs than on CPUs, we propose a transaction-based reconfigurable runtime system for capturing MLP of CDS'. Experiments on linked-list and skip-list show our approach achieves 5.18x and 1.55x throughput improvement on average than lock-based FPGA implementations and optimized CDS algorithms on a state-of-the-art multi-core CPU respectively.", acknowledgement = ack-nhfb, affiliation = "Liu, LB (Reprint Author), Tsinghua Univ, Natl Lab Informat Sci \& Technol, Beijing 100084, Peoples R China. Li, Zhaoshi; Liu, Leibo; Deng, Yangdong; Yin, Shouyi; Wei, Shaojun, Tsinghua Univ, Natl Lab Informat Sci \& Technol, Beijing 100084, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "li-zs12@mail.tsinghua.edu.cn liulb@tsinghua.edu.cn dengyd@tsinghua.edu.cn yinsy@tsinghua.edu.cn wsj@tsinghua.edu.cn", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Natural Science Foundation of China [61672317]; National Science Technology Major Project [2016ZX01012101]", funding-text = "This work was supported in part by National Natural Science Foundation of China (No. 61672317) and National Science Technology Major Project (No. 2016ZX01012101).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "concurrent data structures; data structures; Data structures; data structures; efficient FPGA circuits; field programmable gate arrays; Field programmable gate arrays; FPGA-based hybrid architecture; graph analytics; heterogeneous systems; highly dynamic behaviors; in-memory database; Instruction sets; memory subsystems; memory traffic; memory-level parallelism; MLP; multicore CPU; optimized CDS algorithms; parallel architectures; Programming; Reconfigurable hardware; reconfigurable transactional execution; Semantics; synchronisation; Synchronization; synchronization bottleneck; synchronization primitives; system performance enhancement; Throughput; transaction-based reconfigurable runtime system", number-of-cited-references = "12", research-areas = "Computer Science", times-cited = "0", unique-id = "Li:2018:BSB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Ipek:2018:VWC, author = "Engin Ipek and Florian Longnos and Shihai Xiao and Wei Yang", title = "Vertical Writes: Closing the Throughput Gap between Deeply Scaled {STT-MRAM} and {DRAM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "151--154", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2820027", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "STT-MRAM is a second generation MRAM technology that addresses many of the scaling problems of earlier generation magnetic RAMs, and is a promising candidate to replace DRAM due to its high operational speed, scalable energy characteristics, and high write endurance. 
However, making the density of STT-MRAM competitive with that of DRAM while maintaining DRAM-like write throughput has proven challenging. Reducing the area of an STT-MRAM cell requires decreasing the width of the cell access transistor, which lowers the magnitude of the switching current supplied to the storage element during writes, and significantly hampers the switching speed. Consequently, write throughput constitutes a fundamental performance bottleneck for memory systems built from deeply scaled, dense STT-MRAM cells. This paper introduces vertical writes, a new technique that improves the write throughput of memory systems built from high-density STT-MRAM. Vertical writes exploit the observation that once the switching voltage has been applied across the bit lines and source lines in an STT-MRAM array, it is possible to initiate the write operation for additional cells that are attached to the same column by simply turning on the corresponding word lines. By leveraging the ability to write a 0 or a 1 to multiple cells at once, vertical writes improve average system performance by 21 percent, and enable an STT-MRAM based system to come within 5 percent of the performance of a DRAM based system.", acknowledgement = ack-nhfb, affiliation = "Ipek, E (Reprint Author), Univ Rochester, Rochester, NY 14627 USA. Ipek, Engin, Univ Rochester, Rochester, NY 14627 USA. Longnos, Florian; Xiao, Shihai; Yang, Wei, Huawei Technol Co Ltd, Shenzhen 518129, Guangdong, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ipek@cs.rochester.edu florian.longnos@huawei.com xiaoshihai@huawei.com william.yangwei@huawei.com", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cell access transistor; Computer architecture; Decoding; deeply scaled STT-MRAM cells; dense STT-MRAM cells; DRAM based system; DRAM chips; DRAM-like write throughput; earlier generation magnetic RAMs; generation MRAM technology; high operational speed; high write endurance; high-density STT-MRAM; magnetic tunnelling; Memory systems; memory systems; Memory systems; Microprocessors; MRAM devices; non-volatile memories; Random access memory; random-access storage; scalable energy characteristics; STT-MRAM; STT-MRAM array; STT-MRAM based system; Switches; switching current; switching speed; Throughput; throughput gap; write operation; Writing", keywords-plus = "PERFORMANCE; DESIGN", number-of-cited-references = "23", research-areas = "Computer Science", times-cited = "0", unique-id = "Ipek:2018:VWC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gan:2018:AIC, author = "Yu Gan and Christina Delimitrou", title = "The Architectural Implications of Cloud Microservices", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "155--158", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2839189", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cloud services have recently undergone a shift from monolithic applications to microservices, with hundreds or thousands of loosely-coupled microservices comprising the end-to-end application. Microservices present both opportunities and challenges when optimizing for quality of service (QoS) and cloud utilization. In this paper we explore the implications cloud microservices have on system bottlenecks, and datacenter server design. 
We first present and characterize an end-to-end application built using tens of popular open-source microservices that implements a movie renting and streaming service, and is modular and extensible. We then use the end-to-end service to study the scalability and performance bottlenecks of microservices, and highlight implications they have on the design of datacenter hardware. Specifically, we revisit the long-standing debate of brawny versus wimpy cores in the context of microservices, we quantify the I-cache pressure they introduce, and measure the time spent in computation versus communication between microservices over RPCs. As more cloud applications switch to this new programming model, it is increasingly important to revisit the assumptions we have previously used to build and manage cloud systems.", acknowledgement = ack-nhfb, affiliation = "Delimitrou, C (Reprint Author), Cornell Univ, Ithaca, NY 14850 USA. Gan, Yu; Delimitrou, Christina, Cornell Univ, Ithaca, NY 14850 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "lyg397@cornell.edu delimitrou@cornell.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application studies resulting in better multiple-processor systems; architectural implications; Cloud computing; cloud computing; cloud microservices; cloud utilization; computer centres; datacenter server design; distributed applications; Electric breakdown; end-to-end service; Motion pictures; movie renting; Open source software; open-source microservices; power aware computing; QoS; quality of service; Quality of service; quality of service; Servers; streaming service; Super (very large) computers; Videos", number-of-cited-references = "20", research-areas = "Computer Science", times-cited = "0", unique-id = "Gan:2018:AIC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Shwartz:2018:DMI, author = "Ofir Shwartz and Yitzhak Birk", title = "Distributed Memory Integrity Trees", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "159--162", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2822705", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Ensuring the correct execution of a program running on untrusted computing platforms, wherein the OS, hypervisor, and all off-CPU-chip hardware, including memory, are untrusted, (also) requires protecting the integrity of the memory content against replay attacks. This requires dedicated tracking structures and in-chip state storage. For this purpose, integrity trees are used in various forms, varying in complexity, size, and performance; yet, existing integrity trees do not address distributed, shared-memory computations, for which one must also ensure the integrity of the coherence state of the memory. 
Observing that a block not residing at a given node merely needs to be known by that node as such, we present the novel Distributed Integrity Tree (DIT) method, and show that it can be used effectively to extend existing integrity trees to parallel and distributed environments. Using DIT, we constructed a Distributed Merkle Tree, a Distributed Bonsai Merkle Tree, and a distributed Intel SGX's Memory Encryption Engine integrity mechanism. All these extensions entail negligible overhead.", acknowledgement = ack-nhfb, affiliation = "Shwartz, O (Reprint Author), Technion, Elect Engn Dept, IL-3200003 Haifa, Israel. Shwartz, Ofir; Birk, Yitzhak, Technion, Elect Engn Dept, IL-3200003 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ofirshw@tx.technion.ac.il birk@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Hasso Plattner Institute", funding-text = "This work was supported in part by the Hasso Plattner Institute.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "coherence state; computer security; correct execution; cryptography; data integrity; Data transfer; distributed Bonsai Merkle tree; Distributed computing; Distributed databases; distributed environment; distributed integrity tree method; distributed Intel SGX's Memory Encryption Engine integrity mechanism; distributed memory integrity; Encryption; hypervisor; in-chip state storage; integrity tree; memory content; Memory management; Metadata; off-CPU-chip hardware; operating systems (computers); parallel environment; parallel processing; shared memory; shared memory systems; shared-memory computations; trees (mathematics); trusted computing; untrusted computing platforms", keywords-plus = "PERFORMANCE", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Shwartz:2018:DMI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yun:2018:RPP, author = "Ji-Tae Yun and Su-Kyung Yoon and Jeong-Geun Kim and Bernd Burgstaller and Shin-Dug Kim", title = "Regression Prefetcher with Preprocessing for {DRAM--PCM} Hybrid Main Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "163--166", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2841835", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "This research is to design an effective hybrid main memory structure for graph processing applications, because it is quite expensive to use only high-speed DRAM for such applications. 
Thus, we propose a DRAM-PCM hybrid main memory structure to reduce the cost and energy consumption and design regression prefetch scheme to cope with irregular access patterns in large graph processing workloads. In addition, the prefetch includes preprocessing algorithm to maximize prefetching performance. Our experimental evaluation shows a performance improvement of 36 percent over a conventional DRAM model, 15 percent over existing prefetch models such as GHB/PC, SMS, and AMPM, and 6 percent over the latest model.", acknowledgement = ack-nhfb, affiliation = "Kim, SD (Reprint Author), Yonsei Univ, Dept Comp Sci, Seoul 03722, South Korea. Yun, Ji-Tae; Yoon, Su-Kyung; Kim, Jeong-Geun; Burgstaller, Bernd; Kim, Shin-Dug, Yonsei Univ, Dept Comp Sci, Seoul 03722, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jty11@yonsei.ac.kr sk.yoon@yonsei.ac.kr junggeun@yonsei.ac.kr bburg@yonsei.ac.kr sdkim@yonsei.ac.kr", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Next Generation Information Computing Development Program through the National Research Foundation of Korea (NRF) --- Ministry of Science, ICT \& Future Planning [NRF-2015M3C4A7065522]; Samsung Electronics; Yonsei University", funding-text = "This research was partially supported by the Next Generation Information Computing Development Program through the National Research Foundation of Korea (NRF) funded by the Ministry of Science, ICT \& Future Planning (NRF-2015M3C4A7065522) and by an Industry-Academy joint research program between Samsung Electronics and Yonsei University.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "buffer management; conventional DRAM model; cost reduction; design regression prefetch scheme; DRAM chips; effective hybrid main memory structure; energy consumption reduction; Engines; graph processing applications; graph theory; high-speed DRAM; irregular access patterns; large graph processing workloads; Load modeling; machine learning; main memory; Memory management; PCM; Phase change materials; phase change memories; prefetch models; Prefetching; prefetching performance; preprocessing algorithm; Random access memory; storage management; Training data", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "0", unique-id = "Yun:2018:RPP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhang:2018:RFA, author = "Jiangwei Zhang and Donald {Kline, Jr.} and Long Fang and Rami Melhem and Alex K. Jones", title = "{RETROFIT}: Fault-Aware Wear Leveling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "167--170", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2840137", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Phase-change memory (PCM) and resistive memory (RRAM) are promising alternatives to traditional memory technologies. However, both PCM and RRAM suffer from limited write endurance and due to process variation from scaling, increasing number of early cell failures continue to put pressure on wear-leveling and fault tolerance techniques. In this paper, we propose RETROFIT, which leverages the spare ``gap'' row used as temporary storage in wear leveling to also be used strategically to guard against early cell wear out. 
RETROFIT is compatible with error correction schemes targeted at mitigating stuck-at faults and provides benefits when single or multiple spare rows are available. RETROFIT enhances lifetime by as much as 107 percent over traditional gap-based wear leveling and 8 percent over perfectly uniform wear leveling with a similar overhead. Furthermore, RETROFIT scales better than wear-leveling combined with error correction as process variation increases.", acknowledgement = ack-nhfb, affiliation = "Zhang, JW (Reprint Author), Natl Univ Def Technol, Changsha 410073, Hunan, Peoples R China. Zhang, JW (Reprint Author), Univ Pittsburgh, ECE Dept, Pittsburgh, PA 15261 USA. Zhang, Jiangwei; Fang, Long, Natl Univ Def Technol, Changsha 410073, Hunan, Peoples R China. Zhang, Jiangwei; Fang, Long, Univ Pittsburgh, ECE Dept, Pittsburgh, PA 15261 USA. Melhem, Rami, Univ Pittsburgh, CS Dept, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jiz148@pitt.edu dek61@pitt.edu lfang@nudt.edu.cn melhem@cs.pitt.edu akjones@pitt.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Aging; and fault-tolerance; Computer architecture; early cell failures; early cell wear; Emerging memories; error correction; Error correction; Error correction codes; fault tolerance; fault tolerance techniques; fault-aware wear leveling; fault-tolerance; multiple spare rows; PCM; perfectly uniform wear leveling; Phase change materials; phase change memories; process variation; Random access memory; random-access storage; Registers; reliability; resistive memory; RETROFIT scales; RRAM; single rows; spare gap row; traditional memory technologies; wear; wear-leveling", number-of-cited-references = "15", ORCID-numbers = "Kline, Jr, Donald/0000-0002-4414-1513", research-areas = "Computer Science", times-cited = "2", unique-id = "Zhang:2018:RFA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kulkarni:2018:LAI, author = "Neeraj Kulkarni and Feng Qi and Christina Delimitrou", title = "Leveraging Approximation to Improve Datacenter Resource Efficiency", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "171--174", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2845841", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Cloud multi-tenancy is typically constrained to a single interactive service colocated with one or more batch, low-priority services, whose performance can be sacrificed. Approximate computing applications offer the opportunity to enable tighter colocation among multiple applications whose performance is important. 
We present Pliant, a lightweight cloud runtime that leverages the ability of approximate computing applications to tolerate some loss in output quality to boost the utilization of shared servers. During periods of high contention, Pliant employs incremental and interference-aware approximation to reduce interference in shared resources. We evaluate Pliant across different approximate applications, and show that it preserves QoS for all co-scheduled workloads, while incurring at most a 5 percent loss in output quality.", acknowledgement = ack-nhfb, affiliation = "Delimitrou, C (Reprint Author), Cornell Univ, Ithaca, NY 14850 USA. Kulkarni, Neeraj; Qi, Feng; Delimitrou, Christina, Cornell Univ, Ithaca, NY 14850 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "nsk49@cornell.edu fq26@cornell.edu delimitrou@cornell.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Approximate computing; approximate computing applications; cloud computing; Cloud computing; cloud multitenancy; co-scheduled workloads; computer centres; datacenter resource efficiency; Interference; interference-aware approximation; lightweight cloud runtime; low-priority services; Monitoring; Pliant; QoS; quality of service; Quality of service; Runtime; scheduling; scheduling and task partitioning; shared resources; single interactive service; Super (very large) computers; support for dynamic compilation; Switches", keywords-plus = "ACCURACY-AWARE OPTIMIZATION; PROGRAMS", number-of-cited-references = "20", ORCID-numbers = "Qi, Feng/0000-0002-0759-5268 Kulkarni, Neeraj/0000-0003-0768-0187", research-areas = "Computer Science", times-cited = "0", unique-id = "Kulkarni:2018:LAI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{AlBarakat:2018:MFM, 
author = "Laith M. AlBarakat and Paul V. Gratz and Daniel A. Jim{\'e}nez", title = "{MTB-Fetch}: Multithreading Aware Hardware Prefetching for Chip Multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "175--178", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2847345", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To fully exploit the scaling performance in Chip Multiprocessors, applications must be divided into semi-independent processes that can run concurrently on multiple cores within a system. One major class of such applications, shared-memory, multi-threaded applications, requires programmers insert thread synchronization primitives (i.e., locks, barriers, and condition variables) in their critical sections to synchronize data access between processes. For this class of applications, scaling performance requires balanced per-thread workloads with little time spent in critical sections. In practice, however, threads often waste significant time waiting to acquire locks/barriers in their critical sections, leading to thread imbalance and poor performance scaling. Moreover, critical sections often stall data prefetchers that mitigate the effects of long critical section stalls by ensuring data is preloaded in the core caches when the critical section is complete. In this paper we examine a pure hardware technique to enable safe data prefetching beyond synchronization points in CMPs. We show that successful prefetching beyond synchronization points requires overcoming two significant challenges in existing prefetching techniques. First, we find that typical data prefetchers are designed to trigger prefetches based on current misses. 
This approach works well for traditional, continuously executing, single-threaded applications. However, when a thread stalls on a synchronization point, it typically does not produce any new memory references to trigger a prefetcher. Second, even in the event that a prefetch were to be correctly directed to read beyond a synchronization point, it will likely prefetch shared data from another core before this data has been written. While this prefetch would be considered ``accurate'' it is highly undesirable, because such a prefetch would lead to three extra ``ping-pong'' movements back and forth between private caches in the producing and consuming cores, incurring more latency and energy overhead than without prefetching. We develop a new data prefetcher, Multi-Thread B-Fetch (MTB-Fetch), built as an extension to a previous single-threaded data prefetcher. MTB-Fetch addresses both issues in prefetching for shared memory multi-threaded workloads. MTB-Fetch achieves a speedup of 9.3 percent for multi-threaded applications with little additional hardware.", acknowledgement = ack-nhfb, affiliation = "AlBarakat, LM (Reprint Author), Texas A\&M Univ, Dept Elect \& Comp Engn, College Stn, TX 77843 USA. AlBarakat, Laith M.; Gratz, Paul, V, Texas A\&M Univ, Dept Elect \& Comp Engn, College Stn, TX 77843 USA. Jimenez, Daniel A., Texas A\&M Univ, Dept Comp Sci \& Engn, College Stn, TX 77843 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "lalbarakat@tamu.edu pgratz@tamu.edu djimenez@cse.tamu.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [I/UCRC-1439722, CCF-1649242, CCF-1216604/1332598]; Intel Corp.", funding-text = "We thank the National Science Foundation, which partially supported this work through grants I/UCRC-1439722, CCF-1649242 and CCF-1216604/1332598 and Intel Corp. 
for their generous support.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; Chip multiprocessor; Chip Multiprocessors; CMPs; core caches; data access synchronization; energy overhead; Hardware; hardware prefetching; long critical section stalls; microprocessor chips; MTB-Fetch; multi-threading; Multicore processing; multiple cores; multithread B-fetch; multithreading aware hardware prefetching; per-thread workloads; poor performance scaling; Prefetching; prefetching techniques; private caches; pure hardware technique; Scalability; scaling performance; semiindependent processes; shared memory; shared memory multithreaded workloads; shared memory systems; single-threaded applications; single-threaded data prefetcher; storage management; synchronisation; Synchronization; synchronization point; thread imbalance; thread synchronization primitives; typical data prefetchers", keywords-plus = "PROCESSORS", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "1", unique-id = "AlBarakat:2018:MFM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Vijayaraghavan:2018:MBA, author = "Thiruvengadam Vijayaraghavan and Amit Rajesh and Karthikeyan Sankaralingam", title = "{MPU--BWM}: Accelerating Sequence Alignment", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "179--182", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2849064", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "DNA sequencing and assembly spans life-altering applications like disease diagnosis to answering questions about our ancestry. 
Sequencing involves state-of-the-art machines generating nucleic acid sequences (AGCT) from wet samples like blood or salvia, followed by aligning these sequences against known reference sequences. Due to the rapid advancement in sequence generation machines relative to Moore's law, the second step (alignment) has now become the bottleneck. Today's state-of-the-art technology for alignment runs software like BWA-MEM on a cluster of high performance general purpose machines that cannot keep up with the rapid rate of data generated by each new generation of sequencer machines. Recent proposals from academia that claim orders of magnitude alignment speedup come at a cost of significant disruption to the hardware and software currently in use in the industry. In this work, we propose MPU-BWM, a hardware-software solution that achieves orders of magnitude speedup (57 x over single core x86) on the state-of-the-art BWA-MEM algorithm, with non-intrusive integration to existing processing clusters and with minimal modifications to the BWA-MEM software.", acknowledgement = ack-nhfb, affiliation = "Vijayaraghavan, T (Reprint Author), SimpleMachines Inc, Madison, WI 53719 USA. Vijayaraghavan, Thiruvengadam; Sankaralingam, Karthikeyan, SimpleMachines Inc, Madison, WI 53719 USA. Rajesh, Amit, James Madison Mem High Sch, Madison, WI 53717 USA. Sankaralingam, Karthikeyan, Univ Wisconsin, Madison, WI 53706 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "vijay@simplemachinesinc.com amitrajesh200@gmail.com karu@cs.wisc.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "assembly spans life-altering applications; bioinformatics; bioinformatics (genome or protein) databases; BWA-MEM software; disease diagnosis; diseases; DNA; DNA sequencing; Engines; genomics; Hardware; hardware-software solution; Heterogeneous (hybrid) systems; high performance general purpose machines; magnitude alignment speedup; Moore's law; MPU-BWM; nucleic acid sequences; parallel architectures; parallel processing; Pipelines; Program processors; reference sequences; Rockets; sequence alignment; sequence generation machines; sequencer machines; sequences; Sequential analysis; sequential machines", number-of-cited-references = "15", ORCID-numbers = "Rajesh, Amit/0000-0003-1679-5517", research-areas = "Computer Science", times-cited = "1", unique-id = "Vijayaraghavan:2018:MBA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{DePestel:2018:RRP, author = "Sander {De Pestel} and Sam {Van den Steen} and Shoaib Akram and Lieven Eeckhout", title = "{RPPM}: Rapid Performance Prediction of Multithreaded Applications on Multicore Hardware", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "183--186", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2849983", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper proposes RPPM which, based on a microarchitecture-independent profile of a multithreaded application, predicts its performance on a previously unseen multicore platform. 
RPPM breaks up multithreaded program execution into epochs based on synchronization primitives, and then predicts per-epoch active execution times for each thread and synchronization overhead to arrive at a prediction for overall application performance. RPPM predicts performance within 12 percent on average (27 percent max error) compared to cycle-level simulation. We present a case study to illustrate that RPPM can be used for making accurate multicore design trade-offs early in the design cycle.", acknowledgement = ack-nhfb, affiliation = "De Pestel, S (Reprint Author), Univ Ghent, B-9000 Ghent, Belgium. De Pestel, Sander; Van den Steen, Sam; Akram, Shoaib; Eeckhout, Lieven, Univ Ghent, B-9000 Ghent, Belgium.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sander.depestel@ugent.be sam.vandensteen@ugent.be shoaib.akram@ugent.be lieven.eeckhout@ugent.be", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Agency for Innovation by Science and Technology in Flanders (IWT); European Research Council (ERC) [741097]", funding-text = "Sander De Pestel is supported through a doctoral fellowship by the Agency for Innovation by Science and Technology in Flanders (IWT). Additional support is provided through the European Research Council (ERC) Advanced Grant agreement no. 741097.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accurate multicore design trade-offs; active execution times; Computational modeling; Instruction sets; Mathematical model; micro-architecture; Microarchitecture; microarchitecture-independent profile; microprocessor chips; Modeling; multi-threaded; multi-threading; multicore hardware; Multicore processing; multiprocessing systems; multithreaded application; multithreaded program execution; performance; Predictive models; rapid performance prediction; RPPM; Synchronization; synchronization overhead; synchronization primitives; unseen multicore platform", number-of-cited-references = "12", ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214", research-areas = "Computer Science", times-cited = "1", unique-id = "Pestel:2018:RRP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zhao:2018:KOA, author = "Wenyi Zhao and Quan Chen and Minyi Guo", title = "{KSM}: Online Application-Level Performance Slowdown Prediction for Spatial Multitasking {GPGPU}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "187--191", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2851207", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Colocating multiple applications on the same spatial multitasking GPGPU improves the system-wide throughput. However, the colocated applications are slowed down differently due to the contention on streaming multiprocessors (SMs), L2 cache and global memory bandwidth. The ability to precisely predict application slowdowns online is useful in many scenarios, e.g., ensuring fair pricing in multi-tenant Cloud systems. 
Prior work on predicting application slowdown is either inaccurate, due to the ignoring of contention on SMs, or inefficient, due to the expensive sequential profiling of concurrent applications via runtime environment switching. To solve the above problem, we propose KSM that enables precise and efficient application-level slowdown prediction without priori application knowledge. KSM is proposed based on the observation that hardware event statistics caused by the colocated applications are strongly correlated with their slowdowns. In more detail, KSM builds a slowdown model based on the hardware event statistics using machine learning techniques offline. At runtime, KSM collects the hardware event statistics, and predicts the slowdowns of all the colocated applications based on the model. Our experimental results show that KSM has negligible runtime overhead and precisely predicts the application-level slowdowns with the prediction error smaller than 9.7 percent.", acknowledgement = ack-nhfb, affiliation = "Zhao, WY (Reprint Author), Shanghai Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200240, Peoples R China. Zhao, Wenyi; Chen, Quan; Guo, Minyi, Shanghai Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200240, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "wenyizhao@sjtu.edu.cn chen-quan@cssjtu.edu.cn guo-my@cssjtu.edu.cn", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Basic Research 973 Program of China [2015CB352403]; National Natural Science Foundation of China (NSFC) [61602301, 61632017]", funding-text = "This work is partially sponsored by the National Basic Research 973 Program of China (No. 2015CB352403), the National Natural Science Foundation of China (NSFC) (61602301, 61632017). Quan Chen and Minyi Guo are co-corresponding authors of this paper.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application-level slowdowns; Bandwidth; cloud computing; colocated applications; computer centres; concurrent applications; Graphics processing units; graphics processing units; Hardware; hardware event statistics; interference; Interference; interference; Kernel; KSM; learning (artificial intelligence); machine learning technique; multiprocessing systems; multitenant cloud systems; online application-level performance slowdown prediction; precise application-level slowdown prediction; priori application knowledge; Resource management; scalability; Slowdown prediction; SM; spatial multitasking GPGPU; spatial multitasking GPGPUs; system-wide throughput; Training", number-of-cited-references = "13", ORCID-numbers = "Zhao, Wenyi/0000-0001-7308-9542 Chen, Quan/0000-0001-5832-0347", research-areas = "Computer Science", times-cited = "0", unique-id = "Zhao:2018:KOA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Swami:2018:AAS, author = "Shivam Swami and Kartik Mohanram", title = "{ARSENAL}: Architecture for Secure Non-Volatile Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "192--196", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2863281", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Whereas data persistence in non-volatile memories (NVMs) enables instant data recovery (IDR) in the face of power/system failures, it also exposes NVMs to data confidentiality and integrity attacks. 
Counter mode encryption and Merkle Tree authentication are established measures to thwart data confidentiality and integrity attacks, respectively, in NVMs. However, these security mechanisms require high overhead atomic security meta-data updates on every write-back in order to support IDR in NVMs. This increases memory traffic and negatively impacts system performance and memory lifetime. Architecture for Secure Non-Volatile Memories (ARSENAL) is an IDR-preserving, low cost, high performance security solution that protects NVM systems against data confidentiality and integrity attacks. ARSENAL synergistically integrates (i) Smart Writes for Faster Transactions (SWIFT), a novel technique to reduce the performance overhead of atomic security meta-data updates on every write-back, with (ii) Terminal BMT Updates (TBU), a novel BMT-consistency-preserving technique, to facilitate IDR in the face of power/system failures. Our evaluations show that on average, ARSENAL improves system performance (measured in IPC) by 2.26x (4x), reduces memory traffic overhead by 1.47x (1.88x), and improves memory lifetime by 2x (3.5x) in comparison to conventional IDR-preserving 64-bit (128-bit) encryption+authentication.", acknowledgement = ack-nhfb, affiliation = "Mohanram, K (Reprint Author), Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Swami, Shivam; Mohanram, Kartik, Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "shs173@pitt.edu kmram@pitt.edu", da = "2019-06-20", doc-delivery-number = "GT5EV", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-1217738]", funding-text = "This research was supported by NSF Award CCF-1217738. We also thank the editor and the reviewers for their constructive comments that have helped us elaborate and improve the content of the paper.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architecture for nonvolatile memories; ARSENAL; authentication; Authentication; authentication; cryptography; data confidentiality; data integrity; data integrity attacks; data persistence; encryption; Encryption; failure analysis; hardware security; high overhead atomic security meta-data updates; high performance security solution; IDR; IDR-preserving encryption-authentication; instant data recovery; integrated circuit reliability; memory architecture; memory lifetime; Memory management; memory traffic overhead; Non-volatile memories; Nonvolatile memory; NVMs; power failures; Random access memory; random-access storage; security mechanisms; smart writes for faster transactions; SWIFT; system failures; system performance; terminal BMT updates", keywords-plus = "ENCRYPTION; PERFORMANCE", number-of-cited-references = "28", research-areas = "Computer Science", times-cited = "0", unique-id = "Swami:2018:AAS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Basak:2018:ECC, author = "Abanti Basak and Xing Hu and Shuangchen Li and Sang Min Oh and Yuan Xie", title = "Exploring Core and Cache Hierarchy Bottlenecks in Graph Processing Workloads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "197--200", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2864964", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Graph processing is an important analysis technique for a wide range of big data problems. The ability to explicitly represent relationships between entities gives graph analytics significant performance advantage over traditional relational databases. 
In this paper, we perform an in-depth data-aware characterization of graph processing workloads on a simulated multi-core architecture, find bottlenecks in the core and the cache hierarchy that are not highlighted by previous characterization work, and analyze the behavior of the specific application data type causing the corresponding bottleneck. We find that load-load dependency chains involving different application data types form the primary bottleneck in achieving a high memory-level parallelism in graph processing workloads. We also observe that the private L2 cache has a negligible contribution to performance. whereas the shared L3 cache has higher performance sensitivity. In addition, we present a study on the effectiveness of several replacement policies. Finally, we study the relationship between different graph algorithms and the access volumes to the different data types. Overall, we provide useful insights and guidelines toward developing a more optimized CPU-based architecture for high performance graph processing.", acknowledgement = ack-nhfb, affiliation = "Basak, A (Reprint Author), Univ Calif Santa Barbara, Santa Barbara, CA 93106 USA. Basak, Abanti; Hu, Xing; Li, Shuangchen; Oh, Sang Min; Xie, Yuan, Univ Calif Santa Barbara, Santa Barbara, CA 93106 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "abasak@umail.ucsb.edu xinghu.cs@gmail.com shuangchenli@ece.ucsb.edu sangminoh@umail.ucsb.edu yuanxie@gmail.com", da = "2019-06-20", doc-delivery-number = "GT5EV", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation [1730309/1719160/1500848]; CRISP, one of six centers in JUMP, a Semiconductor Research Corporation program - DARPA", funding-text = "This work was supported in part by US National Science Foundation 1730309/1719160/1500848 and by CRISP, one of six centers in JUMP, a Semiconductor Research Corporation program sponsored by DARPA.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application data type; Arrays; Benchmark testing; Big Data; big data problems; Cache Hierarchy; cache hierarchy bottlenecks; cache storage; CPU-based architecture; graph algorithms; graph analytics; Graph Processing; graph processing workloads; graph theory; Guidelines; high performance graph processing; in-depth data-aware characterization; Layout; load-load dependency chains; mathematics computing; Memory-Level Parallelism; memory-level parallelism; microprocessor chips; multicore architecture; multiprocessing systems; parallel architectures; performance evaluation; performance sensitivity; private L2 cache; Random access memory; Sensitivity; shared L3 cache", number-of-cited-references = "13", ORCID-numbers = "Oh, Sang Min/0000-0001-7119-6934", research-areas = "Computer Science", times-cited = "1", unique-id = "Basak:2018:ECC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Khatamifard:2018:NCC, author = "S. Karen Khatamifard and Longfei Wang and Selcuk K{\"o}se and Ulya R. 
Karpuzcu", title = "A New Class of Covert Channels Exploiting Power Management Vulnerabilities", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "201--204", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2860006", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Effective runtime power management requires hardware activity to be tracked at a very fine granularity in both space and time in order to meet diverse workload performance requirements within a tight power budget. As the available instantaneous power budget itself represents a shared resource, this practically translates into finding the optimal allocation of the power budget among active tasks of execution. Covert communication over a previously unexplored class of channels thereby becomes possible, which forms the focus of this paper.", acknowledgement = ack-nhfb, affiliation = "Khatamifard, SK (Reprint Author), Univ Minnesota, Minneapolis, MN 55455 USA. Khatamifard, S. Karen; Karpuzcu, Ulya R., Univ Minnesota, Minneapolis, MN 55455 USA. Wang, Longfei; Kose, Selcuk, Univ S Florida, Tampa, FL 33620 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "khatami@umn.edu longfei@mail.usf.edu ukarpuzc@umn.edu", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF CAREER Award [CCF-1350451]; NSF/SRC [CNS-1715286]; Cisco Systems Research Award", funding-text = "This work is supported in part by the NSF CAREER Award under Grant CCF-1350451, in part by the NSF/SRC Award under Grant CNS-1715286, and in part by the Cisco Systems Research Award.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "covert channels; covert communication; diverse workload performance requirements; fine granularity; Hardware; hardware activity; instantaneous power budget; Monitoring; optimal allocation; power aware computing; Power demand; Power management vulnerabilities; power management vulnerabilities; Power system management; runtime power management; security of data; Software; System-on-chip; tight power budget; Voltage control", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "0", unique-id = "Khatamifard:2018:NCC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kondguli:2018:BUS, author = "Sushant Kondguli and Michael Huang", title = "{Bootstrapping}: Using {SMT} Hardware to Improve Single-Thread Performance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "205--208", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2859945", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Decoupled look-ahead (DLA) architectures have been shown to be an effective way to improve single-thread performance. However, a default implementation requires an additional core. While an SMT flavor is possible, a naive implementation is inefficient and thus slow. In this paper, we propose an optimized implementation called Bootstrapping that makes DLA just as effective on a single (SMT) core as using two cores. 
While fusing two cores can improve single-thread performance by 1.23x, Bootstrapping provides a speedup of 1.51.", acknowledgement = ack-nhfb, affiliation = "Kondguli, S (Reprint Author), Univ Rochester, Dept Elect \& Comp Engn, Rochester, NY 14627 USA. Kondguli, Sushant; Huang, Michael, Univ Rochester, Dept Elect \& Comp Engn, Rochester, NY 14627 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sushant.kondguli@rochester.edu michael.huang@rochester.edu", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [1514433, 1533842]", funding-text = "This work is supported in part by NSF under grants 1514433 and 1533842.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "bootstrapping; Computer architecture; Context; Decoupled look-ahead (DLA) architectures; decoupled look-ahead architectures; DLA architecture; multi-threading; multiprocessing systems; optimisation; optimized implementation; Prefetching; Resource management; simultaneous multi-threading (SMT); single core; single thread performance; single-thread performance; Skeleton; SMT hardware; Substrates", number-of-cited-references = "20", research-areas = "Computer Science", times-cited = "1", unique-id = "Kondguli:2018:BUS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kline:2018:CAR, author = "Donald {Kline, Jr.} and Rami Melhem and Alex K. 
Jones", title = "Counter Advance for Reliable Encryption in Phase Change Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "209--212", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2861012", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The use of hardware encryption and new memory technologies such as phase change memory (PCM) are gaining popularity in a variety of server applications such as cloud systems. While PCM provides energy and density advantages over conventional DRAM memory, it faces endurance challenges. Such challenges are exacerbated when employing memory encryption as the stored data is essentially randomized. losing data locality and reducing or eliminating the effectiveness of energy and endurance aware encoding techniques. This results in increasing dynamic energy consumption and accelerated wear out. In this paper we propose counter advance, a technique to leverage the process of encryption to improve reliability and lifetime while maintaining low-energy and low-latency operation. Counter advance is compatible with standard error-correction codes (ECC) and error correction pointers (ECP), the standard for mitigating endurance faults in PCM. Counter advance achieves the same fault tolerance using three ECP pointers for a 10(-4) cell failure rate compared to the leading approach to consider energy savings and reliability for encrypted PCM (SECRET) using five ECP pointers. At a failure rate of 10(-2), counter advance can achieve an uncorrectable bit error rate (UBER) of 10(-1), compared to < 10(-4) for SECRET using six ECP pointers. 
This leads to a lifetime improvement of 3.8x while maintaining comparable energy consumption and access latency.", acknowledgement = ack-nhfb, affiliation = "Kline, D (Reprint Author), Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Kline, Donald, Jr.; Jones, Alex K., Univ Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Melhem, Rami, Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "dek61@pitt.edu melhem@cs.pitt.edu akjones@pitt.edu", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [1747452]; IUCRC Program of the National Science Foundation [CNS-1738783]; SHREC", funding-text = "This work was supported by NSF Graduate Research Fellowship award number 1747452, and SHREC industry and agency members and by the IUCRC Program of the National Science Foundation (Grant No. CNS-1738783).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "and error correction; Computer architecture; Emerging memories; Encryption; error correction; Error correction; Memory management; Microprocessors; Phase change materials; reliability; stuck-at faults", number-of-cited-references = "16", oa = "Bronze", ORCID-numbers = "Kline, Jr, Donald/0000-0002-4414-1513", research-areas = "Computer Science", times-cited = "0", unique-id = "Kline:2018:CAR", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Sahoo:2018:RRD, author = "Debiprasanna Sahoo and Swaraj Sha and Manoranjan Satpathy and Madhu Mutyam", title = "{ReDRAM}: a Reconfigurable {DRAM} Cache for {GPGPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "213--216", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2865552", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware-based DRAM cache techniques for GPGPUs propose to use GPU DRAM as a cache of the host (system) memory. However, these approaches do not exploit the opportunity of allocating store-before-load data (data that is written before being read by GPU cores) on GPU DRAM that would save multiple CPU-GPU transactions. In this context, we propose ReDRAM, a novel memory allocation strategy for GPGPUs which re-configures GPU DRAM cache as a heterogeneous unit. It allows allocation of store-before-load data directly onto GPU DRAM and also utilizes it as a cache of the host memory. Our simulation results using a modified version of GPGPU-Sim show that ReDRAM can improve performance for applications that use store-before-load data by 57.6 percent (avg.) and 4.85x (max.) 
when compared to the existing proposals on state-of-the-art GPU DRAM caches.", acknowledgement = ack-nhfb, affiliation = "Sahoo, D (Reprint Author), Indian Inst Technol Bhubaneswar, Bhubaneswar 751013, Odisha, India. Sahoo, Debiprasanna; Sha, Swaraj; Satpathy, Manoranjan, Indian Inst Technol Bhubaneswar, Bhubaneswar 751013, Odisha, India. Mutyam, Madhu, Indian Inst Technol Madras, Madras 600036, Tamil Nadu, India.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "debiprasanna.sahoo@gmail.com ss24@iitbbs.ac.in manoranjan@iitbbs.ac.in madhu@cse.iitm.ac.in", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arrays; cache storage; CPU-GPU communication; DRAM cache; DRAM chips; efficiency 57.6 percent; GPGPU; GPGPU-Sim; GPGPUs; GPU cores; GPU DRAM cache; graphics processing units; Graphics processing units; Hardware; hardware-based DRAM cache techniques; heterogeneous unit; host memory; memory allocation strategy; Memory management; multiple CPU-GPU transactions; Random access memory; reconfigurable DRAM cache; ReDRAM; resource allocation; Resource management; store-before-load; store-before-load data allocation; tagless", number-of-cited-references = "16", ORCID-numbers = "Mutyam, Madhu/0000-0003-1638-4195 Sahoo, Debiprasanna/0000-0003-1438-0617", research-areas = "Computer Science", researcherid-numbers = "Mutyam, Madhu/B-1717-2012", times-cited = "0", unique-id = "Sahoo:2018:RRD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Mashimo:2018:VMS, author = "Susumu Mashimo and Ryota Shioya and Koji Inoue", title = "{VMOR}: Microarchitectural Support for Operand Access in an Interpreter", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "217--220", month = jul # "\slash " # dec, year = "2018", CODEN 
= "????", DOI = "https://doi.org/10.1109/LCA.2018.2866243", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Dynamic scripting languages become very popular for high productivity. However, many of these languages have significant runtime overheads because they employ interpreter-based virtual machines. One of the major overheads for the interpreter is derived from operand accesses, which significantly increase memory accesses. We propose VMOR, microarchitectural support for the operand accesses in the interpreter. VMOR remaps operand values into floating-point physical registers, which are rarely used in the interpreter, and thus. VMOR effectively reduces the memory accesses.", acknowledgement = ack-nhfb, affiliation = "Mashimo, S (Reprint Author), Kyushu Univ, Fukuoka, Fukuoka 8190395, Japan. Mashimo, Susumu; Inoue, Koji, Kyushu Univ, Fukuoka, Fukuoka 8190395, Japan. Shioya, Ryota, Nagoya Univ, Nagoya, Aichi 4648601, Japan.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "susumu.mashimo@cpc.ait.kyushu-u.ac.jp shioya@nuee.nagoya-u.ac.jp inoue@ait.kyushu-u.ac.jp", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "JSPS KAKENHI [JP17J10388]", funding-text = "This work was supported by JSPS KAKENHI Grant Number JP17J10388.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "authoring languages; Cryptography; Dynamic scripting language; dynamic scripting languages; floating-point physical registers; Hardware; high productivity; interpreter; interpreter-based virtual machines; memory accesses; microarchitectural support; Microarchitecture; operand access; operand values; Pipelines; Productivity; program interpreters; Proposals; Registers; virtual machines; VMOR", number-of-cited-references = "10", research-areas = "Computer Science", times-cited = "0", unique-id = "Mashimo:2018:VMS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Min:2018:SCD, author = "Seungwon Min and Mohammad Alian and Wen-Mei Hwu and Nam Sung Kim", title = "Semi-Coherent {DMA}: an Alternative {I/O} Coherency Management for Embedded Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "221--224", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2866568", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Many modern embedded CPUs adopt Non-Coherent DMA (NC-DMA) over Coherent DMA (C-DMA) because of simplicity. An NC-DMA design, however, requires a CPU device driver to explicitly invalidate or flush a wide range of cache space. When an I/O DMA device writes data to a main memory region, the CPU needs to invalidate the cache space corresponding to the same memory region twice: (1) to prevent dirty cache lines from overwriting the DMA data and (2) to remove any cache lines prefetched before the DMA is done. 
In this work, we first show that such explicit invalidations consume 31 percent of CPU cycles, limiting the data transfer throughput of a high-speed network interface card (NIC) when receiving network packets. Second, we propose a Semi-Coherent DMA (SC-DMA) architecture for improving the efficiency of NC-DMA with a slight modification to the hardware. Specifically, our SC-DMA records the DMA region and prohibits any data that is prefetched from the region from entering the cache, reducing nearly 50 percent of the unnecessary invalidations. Lastly, we identify several software optimizations that can substantially reduce excessive cache invalidations prevalent in NIC drivers. Our evaluation with NVIDIA Jetson TX2 shows that our proposed SC-DMA design with the NIC driver optimizations can improve the NIC data transfer throughput by up to 53.3 percent.", acknowledgement = ack-nhfb, affiliation = "Kim, NS (Reprint Author), Univ Illinois, Elect \& Comp Engn, Urbana, IL 61820 USA. Min, Seungwon; Alian, Mohammad; Hwu, Wen-Mei; Kim, Nam Sung, Univ Illinois, Elect \& Comp Engn, Urbana, IL 61820 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "min16@illinois.edu malian2@illinois.edu w-hwu@illinois.edu nskim@illinois.edu", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "SRC/JUMP Applications Driving Architectures (ADA) Research Center; IBM-ILLI-NOIS Center for Cognitive Computing Systems Research (C3SR)", funding-text = "This work is supported in part by grants from SRC/JUMP Applications Driving Architectures (ADA) Research Center and IBM-ILLI-NOIS Center for Cognitive Computing Systems Research (C3SR).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; C-DMA; Cache; cache lines; cache space; cache storage; coherency management; coherent DMA; Computer architecture; CPU cycles; CPU device driver; Data transfer; device drivers; Device drivers; DMA data; DMA device; DMA region; embedded CPUs; embedded processor; embedded systems; Embedded systems; embedded systems; Ethernet; excessive cache invalidations; Hardware; high-speed network interface card; Internet of Things; main memory region; microprocessor chips; multiprocessing systems; NC-DMA design; NIC data transfer throughput; noncoherent DMA; Prefetching; SC-DMA design; SC-DMA records; semicoherent DMA architecture", number-of-cited-references = "16", ORCID-numbers = "Min, Seung Won/0000-0001-7195-7182", research-areas = "Computer Science", times-cited = "0", unique-id = "Min:2018:SCD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Nematollahi:2018:NSD, author = "Negin Nematollahi and Mohammad Sadrosadati and Hajar Falahati and Marzieh Barkhordar and Hamid Sarbazi-Azad", title = "{Neda}: Supporting Direct Inter-Core Neighbor Data Exchange in {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "225--229", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2873679", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Image processing applications employ various fitters for several purposes, such as enhancing the images and extracting the features. Recent studies show that filters in image processing applications take a substantial amount of the execution time. and it is crucial to boost their performance to improve the overall performance of the image processing applications. 
Image processing filters require a significant amount of data sharing among threads which are in charge of filtering neighbor pixels. Graphics Processing Units (GPUs) attempt to satisfy the demand of data sharing by providing the scratch-pad memory, shuffle instructions, and on-chip caches. However, we observe that these mechanisms are insufficient to provide a fast and energy-efficient neighbor data sharing for the image processing filters. In this paper, we propose a new hardware/software co-design mechanism for GPUs, to effectively provide a fast and energy-efficient register-level neighbor data sharing for the image fitters. We propose a neighbor data exchange mechanism. called Neda, that adds a register to each streaming processor (SP) which can be accessed by its neighboring SPs. Our experimental results show that Neda improves the performance and energy consumption by 12.4 and 13.5 percent, on average, respectively, compared to the NVIDIA SDK implementation of image processing filters. Moreover, Neda's performance is within 9.3 percent of the ideal GPU with zero latency neighbor data exchange capability.", acknowledgement = ack-nhfb, affiliation = "Nematollahi, N (Reprint Author), Sharif Univ Technol, Dept Comp Engn, Tehran 111559517, Iran. Nematollahi, Negin; Sadrosadati, Mohammad; Barkhordar, Marzieh; Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp Engn, Tehran 111559517, Iran. Falahati, Hajar; Sarbazi-Azad, Hamid, Inst Res Fundamental Sci, Comp Sci Sch, Tehran 193955531, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "negin.mahani@gmail.com m.sadr89@gmail.com hfalahati@ipm.ir marzieh.barkhordar@gmail.com azad@sharif.edu", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; direct inter-core neighbor data exchange mechanism; efficiency 13.5 percent; efficiency 9.3 percent; electronic data interchange; energy consumption; energy-efficient neighbor data sharing; energy-efficient register-level neighbor data sharing; fast energy-efficient neighbor data; feature extraction; GPUs; Graphics processing units; graphics processing units; hardware-software co-design mechanism; hardware-software codesign; image enhancement; image filtering; image filters; Image processing; image processing applications; image processing filters; Instruction sets; inter-core communication; Microsoft Windows; Neda; neighbor data exchange; NVIDIA SDK implementation; on-chip caches; Registers; scratch-pad memory; shuffle instructions; spatial image processing filters; streaming processor; Two dimensional displays; zero latency neighbor data exchange capability", keywords-plus = "MEAN FILTERS; IMAGE; DOMAIN", number-of-cited-references = "40", research-areas = "Computer Science", times-cited = "0", unique-id = "Nematollahi:2018:NSD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Omar:2018:MRI, author = "Hamza Omar and Halit Dogan and Brian Kahne and Omer Khan", title = "Multicore Resource Isolation for Deterministic, Resilient and Secure Concurrent Execution of Safety-Critical Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "230--234", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2874216", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Multicores increasingly deploy spatial execution of safety-critical applications that demand a deterministic, resilient, and 
secure environment to meet the safety standards. However, multicores aggressively share hardware resources that leads to non-deterministic performance due to destructive interference from concurrent applications. Resource sharing not only hinders efficient resilient execution, but also introduces security vulnerabilities due to information leakage on side-channels. This work proposes a novel multicore framework that constructs isolated clusters of cores for each concurrent application. It guarantees concurrent applications with deterministic performance, as well as an efficient execution environment for resiliency and security. Moreover, the framework allows dynamic re-sizing of cluster sizes for load balanced execution of concurrent applications. However, it leads to diminished isolation between clusters, which opens various performance-resilience and performance-security tradeoffs.", acknowledgement = ack-nhfb, affiliation = "Khan, O (Reprint Author), Univ Connecticut, Dept Elect \& Comp Engn, Storrs, CT 06269 USA. Omar, Hamza; Dogan, Halit; Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn, Storrs, CT 06269 USA. Kahne, Brian, NXP Semicond Inc, Automot Microcontrollers \& Processors, Austin, TX 78735 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "hamza.omar@uconn.edu halit.dogan@uconn.edu brian.kahne@nxp.com omer.khan@uconn.edu", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [CCF-1550470, CNS-1718481]", funding-text = "This research was partially supported by the National Science Foundation under Grants No. CCF-1550470 and CNS-1718481.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "concurrency control; concurrent application; deterministic performance; Hardware; hardware resource sharing; hardware resources; Interference; load balanced execution; Multicore; multicore framework; Multicore processing; multicore resource isolation; multicores; multiprocessing systems; nondeterministic performance; Program processors; resilience; Resilience; resilience; resource allocation; safety-critical applications; safety-critical systems; secure environment; security; Security; security; security of data; security vulnerabilities; side-channels; spatial execution; System-on-chip", number-of-cited-references = "20", ORCID-numbers = "Khan, Omer/0000-0001-6293-7403", research-areas = "Computer Science", times-cited = "0", unique-id = "Omar:2018:MRI", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Zokaee:2018:APM, author = "Farzaneh Zokaee and Hamid R. Zarandi and Lei Jiang", title = "{AligneR}: a Process-in-Memory Architecture for Short Read Alignment in {ReRAMs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "235--238", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2854700", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Genomics is the key to enable the personal customization of medical care. How to fast and energy-efficiently analyze the huge amounts of genomic sequence data generated by next generation sequencing technologies has become one of the most significant challenges facing genomics today. Existing hardware platforms achieve low genome sequencing throughput with significant hardware and power overhead. 
In this paper, we propose AligneR, a ReRAM-based process-in-memory architecture, to accelerate the bottleneck of genome sequencing, i.e., short read alignment. Compared to state-of-the-art accelerators, AligneR improves the short read alignment throughput per Watt per mm(2) by 13x.", acknowledgement = ack-nhfb, affiliation = "Zokaee, F (Reprint Author), Indiana Univ, Bloomington, IN 47405 USA. Zokaee, Farzaneh; Jiang, Lei, Indiana Univ, Bloomington, IN 47405 USA. Zokaee, Farzaneh; Zarandi, Hamid R., Amirkabir Univ Technol, Tehran 158754413, Iran.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "f\_zokaee@aut.ac.ir h\_zarandi@aut.ac.ir jiang60@iu.edu", da = "2019-06-20", doc-delivery-number = "HE6YC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bioinformatics; Computer architecture; FM-index; Genome sequencing; Genomics; Memory management; Microprocessors; process-in-memory; Random access memory; ReRAM; Sequential analysis; short read alignment; Throughput", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "0", unique-id = "Zokaee:2018:APM", web-of-science-categories = "Computer Science, Hardware \& Architecture", xxpages = "237--240", } @Article{Lou:2018:BSB, author = "Qian Lou and Lei Jiang", title = "{BRAWL}: a Spintronics-Based Portable Basecalling-in-Memory Architecture for Nanopore Genome Sequencing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "239--242", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2882384", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Nanopore sequencing is one of the most promising genome 
sequencing technologies because of its ability to produce ultra long reads and provide portability. Basecalling, the most time-consuming step in the whole flow of Nanopore genome sequencing, translates analog signals to digital DNA symbols. The state-of-the-art basecaller relies on a complex neural network consisting of convolutional, long short-term memory and fully-connected layers, and a CTC decoder. Existing neural network portable accelerators achieve low basecalling throughput per Watt when processing such neural network inferences. In this paper, we propose BRAWL, a portable Basecalling-in-memory architecture, to translate RAW electrical signals to digital DNA symbols in SOT-MRAMs for Nanopore portable sequencers. Compared to state-of-the-art accelerators, BRAWL improves basecalling throughput per Watt by 3: 88x.", acknowledgement = ack-nhfb, affiliation = "Jiang, L (Reprint Author), Indiana Univ, Bloomington, IN 47405 USA. Lou, Qian; Jiang, Lei, Indiana Univ, Bloomington, IN 47405 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "louqian@iu.edu jiang60@iu.edu", da = "2019-06-20", doc-delivery-number = "HE6YC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Artificial neural networks; basecaller; Bioinformatics; Computer architecture; DNA; genome sequencing; Genomics; Microprocessors; Oxford nanopore technology; process-in-memory; Sequential analysis; SOT-MRAM", keywords-plus = "PERFORMANCE; ENERGY", number-of-cited-references = "26", research-areas = "Computer Science", times-cited = "0", unique-id = "Lou:2018:BSB", web-of-science-categories = "Computer Science, Hardware \& Architecture", xxpages = "241--244", } @Article{Min:2018:AAB, author = "Donghyun Min and Donggyu Park and Jinwoo Ahn and Ryan Walker and Junghee Lee and Sungyong Park and Youngjae Kim", title = "{Amoeba}: an Autonomous Backup and Recovery {SSD} for Ransomware Attack Defense", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "243--246", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2883431", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Ransomware is one of growing concerns in enterprise and government organizations, because it may cause financial damages or loss of important data. Although there are techniques to detect and prevent ransomware, an evolved ransomware may evade them because they are based on monitoring known behaviors. Ransomware can be mitigated if backup copies of data are retained in a safe place. However, existing backup solutions may be under ransomware's control and an intelligent ransomware may destroy backup copies too. They also incur overhead to storage space, performance and network traffic (in case of remote backup). In this paper, we propose an SSD system that supports automated backup, called Amoeba. 
In particular, Amoeba is armed with a hardware accelerator that can detect the infection of pages by ransomware attacks at high speed and a fine-grained backup control mechanism to minimize space overhead for original data backup. For evaluation, we extended the Microsoft SSD simulator to implement Amoeba and evaluated it using the realistic block-level traces, which are collected while running the actual ransomware. According to our experiments, Amoeba has negligible overhead and outperforms in performance and space efficiency over the state-of-the-art SSD, FlashGuard, which supports data backup within the device.", acknowledgement = ack-nhfb, affiliation = "Kim, Y (Reprint Author), Sogang Univ, Seoul 04107, South Korea. Min, Donghyun; Park, Donggyu; Ahn, Jinwoo; Park, Sungyong; Kim, Youngjae, Sogang Univ, Seoul 04107, South Korea. Walker, Ryan; Lee, Junghee, Univ Texas San Antonio, San Antonio, TX 78249 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "mdh38112@sogang.ac.kr dgpark@sogang.ac.kr jinu37@sogang.ac.kr ryan.walker@utsa.edu junghee.lee@utsa.edu parksy@sogang.ac.kr youkim@sogang.ac.kr", da = "2019-06-20", doc-delivery-number = "HE6YC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea (NRF) --- Korea Government (MSIT) [NRF-2018R1A1A1A05079398]", funding-text = "This work was supported by the National Research Foundation of Korea (NRF) grant funded by the Korea Government (MSIT) (No. NRF-2018R1A1A1A05079398).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Aerospace electronics; Amoeba; autonomous backup SSD; autonomous recovery SSD; back-up procedures; cryptography; Cryptography; data backup; Entropy; FlashGuard; intelligent ransomware; invasive software; Microsoft SSD simulator; Performance evaluation; Ransomware; ransomware attack; ransomware attack defense; Solid-state drive (SSD); SSD system; storage management; storage security", number-of-cited-references = "12", ORCID-numbers = "Park, Sungyong/0000-0002-0309-1820 Min, Donghyun/0000-0002-6043-9264", research-areas = "Computer Science", times-cited = "0", unique-id = "Min:2018:AAB", web-of-science-categories = "Computer Science, Hardware \& Architecture", xxpages = "245--248", } @Article{Kim:2018:HBP, author = "Chinam Kim and Hyukjun Lee", title = "A High-Bandwidth {PCM}-Based Memory System for Highly Available {IP} Routing Table Lookup", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "246--249", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2883461", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Achieving higher availability is an unending challenge in router architecture, as process technology scales down and more random logic/memory errors must be tolerated. However, meeting extremely high targets that require only few seconds of yearly downtime puts even more pressure on the design of already complex router architecture. In this paper, we explore the case of storing the routing table in non-volatile memory, to drastically reduce the router downtime and achieve higher availability-without degrading lookup performance. 
We propose a new MLC PCM architecture, featuring decoupled node access and logically managed duplicate bank groups, that fetches the right amount of information from the most available bank. Performance evaluation shows that we achieve an average of 9.9 percent bandwidth improvement over the DRAM baseline system, and an 83.9 percent over the PCM baseline.", acknowledgement = ack-nhfb, affiliation = "Lee, H (Reprint Author), Sogang Univ, Dept Comp Sci \& Engn, Seoul 04107, South Korea. Kim, Chinam; Lee, Hyukjun, Sogang Univ, Dept Comp Sci \& Engn, Seoul 04107, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "chinamkim@sogang.ac.kr hyukjunl@sogang.ac.kr", da = "2019-06-20", doc-delivery-number = "HE6YC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Samsung Electronics", funding-text = "This research is funded by Samsung Electronics. The corresponding author is Hyukjun Lee.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; complex router architecture; decoupled node access; DRAM baseline system; DRAM chips; duplicate bank groups; high-bandwidth PCM-based memory system; highly available IP routing table lookup; IP networks; IP routing table lookup; MLC PCM architecture; Network architecture; nonvolatile memory; PCM baseline; Phase change materials; phase change memories; Phase change memory; process technology; processing-in-memory; Random access memory; random logic errors; random memory errors; router downtime reduction; Routing; table lookup; Table lookup; telecommunication network routing", number-of-cited-references = "13", ORCID-numbers = "Kim, Chinam/0000-0002-7984-2643", research-areas = "Computer Science", times-cited = "0", unique-id = "Kim:2018:HBP", web-of-science-categories = "Computer Science, Hardware \& Architecture", xxpages = "247--250", } @Article{Kim:2019:IGM, author = "Jiho Kim 
and Jehee Cha and Jason Jong Kyu Park and Dongsuk Jeon and Yongjun Park", title = "Improving {GPU} Multitasking Efficiency Using Dynamic Resource Sharing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "1--5", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2889042", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As GPUs have become essential components for embedded computing systems, a shared GPU with multiple CPU cores needs to efficiently support concurrent execution of multiple different applications. Spatial multitasking, which assigns a different amount of streaming multiprocessors (SMs) to multiple applications, is one of the most common solutions for this. However, this is not a panacea for maximizing total resource utilization. It is because an SM consists of many different sub-resources such as caches, execution units and scheduling units, and the requirements of the sub-resources per kernel are not well matched to their fixed sizes inside an SM. To solve the resource requirement mismatch problem, this paper proposes a GPU Weaver, a dynamic sub-resource management system of multitasking GPUs. GPU Weaver can maximize sub-resource utilization through a shared resource controller (SRC) that is added between neighboring SMs. The SRC dynamically identifies idle sub-resources of an SM and allows them to be used by the neighboring SM when possible. Experiments show that the combination of multiple sub-resource borrowing techniques enhances the total throughput by up to 26 and 9.5 percent on average over the baseline spatial multitasking GPU.", acknowledgement = ack-nhfb, affiliation = "Park, Y (Reprint Author), Hanyang Univ, Seoul 04763, South Korea. Kim, Jiho; Cha, Jehee, Hongik Univ, Seoul 04066, South Korea. 
Park, Jason Jong Kyu, Univ Michigan, Ann Arbor, MI 48109 USA. Jeon, Dongsuk, Seoul Natl Univ, Seoul 151742, South Korea. Park, Yongjun, Hanyang Univ, Seoul 04763, South Korea.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "jihokimhi@gmail.com carjehee@gmail.com jasonjk@umich.edu djeon1@snu.ac.kr yongjunpark@hanyang.ac.kr", da = "2019-06-20", doc-delivery-number = "HI0TZ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea (NRF) --- Korea government (MSIP) [NRF-2015R1C1A1A01053844, NRF-2016R1C1B2016072]; ICT R\&D program of MSIP/IITP [2017-0-00142]; R\&D program of MOTIE/KEIT [10077609]", funding-text = "This work was supported in part by the National Research Foundation of Korea (NRF) grant funded by the Korea government (MSIP) (NO. NRF-2015R1C1A1A01053844, NO. NRF-2016R1C1B2016072), ICT R\&D program of MSIP/IITP (No. 2017-0-00142), and the R\&D program of MOTIE/KEIT (No. 10077609).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; dynamic resource sharing; dynamic sub-resource management system; embedded computing systems; embedded systems; GPU multitasking efficiency; GPU Weaver; GPUs; graphics processing units; Graphics processing units; Instruction sets; Kernel; Micromechanical devices; multi-programmed; multiple CPU cores; multiple sub-resource borrowing techniques; multiprogramming; Multitasking; multitasking GPUs; resource allocation; Resource management; resource requirement mismatch problem; resource sharing; scheduling; scheduling units; shared GPU; shared resource controller; spatial multitasking; SRC; streaming multiprocessors; sub-resource utilization; total resource utilization; Weaving", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "0", unique-id = "Kim:2019:IGM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Anonymous:2019:IIC, author = "Anonymous", title = "2018 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 17", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "1--8", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2901240", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Presents the 2018 subject/author index for this publication.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Xu:2019:PFD, author = "Sheng Xu and Xiaoming Chen and Ying Wang and Yinhe Han and Xuehai Qian and Xiaowei Li", title = "{PIMSim}: a Flexible and Detailed Processing-in-Memory Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "6--9", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2885752", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "With the advent of big data applications and new process technologies, Process-in-Memory (PIM) attracts much attention in memory research as the architecture studies gradually shift from processors to heterogeneous aspects. How to achieve reliable and efficient PIM architecture modeling becomes increasingly urgent for the researchers, who want to experiment on critical issues from detailed implementations of their proposed PIM designs. This paper proposes PIMSim, a full-system and highly-configurable PIM simulator to facilitate circuit-, architecture- and system-level researches. PIMSim enables architectural simulation of PIM and implements three simulation modes to provide a wide range of speed/accuracy tradeoffs. It offers detailed performance and energy models to simulate PIM-enabled instructions, compiler, in-memory processing logic, various memory devices, and PIM coherence. PIMSim is open source and available at https://github.com/vineodd/PIMSim.", acknowledgement = ack-nhfb, affiliation = "Xu, S (Reprint Author), Chinese Acad Sci, Inst Comp Technol, Beijing, Peoples R China. Xu, Sheng; Chen, Xiaoming; Wang, Ying; Han, Yinhe; Li, Xiaowei, Chinese Acad Sci, Inst Comp Technol, Beijing, Peoples R China. 
Xu, Sheng; Li, Xiaowei, Univ Chinese Acad Sci, Beijing 101408, Peoples R China. Qian, Xuehai, Univ Southern Calif, Los Angeles, CA 90007 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "xusheng02@ict.ac.cn chenxiaoming@ict.ac.cn wangying2009@ict.ac.cn yinhes@ict.ac.cn xuehai.qian@usc.edu lxw@ict.ac.cn", da = "2019-06-20", doc-delivery-number = "HI0TZ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Natural Science Foundation of China (NSFC) [61522406, 61834006, 61521092]; Beijing Municipal Science \& Technology Commission [Z171100000117019, Z181100008918006]; Strategic Priority Research Program of the Chinese Academy of Sciences [XDPB12]; Innovative Project of Institute of Computing Technology, CAS [5120186140]", funding-text = "This work was supported in part by National Natural Science Foundation of China (NSFC) under grants 61522406, 61834006, and 61521092, Beijing Municipal Science \& Technology Commission (Z171100000117019, Z181100008918006), Strategic Priority Research Program of the Chinese Academy of Sciences (XDPB12), and an Innovative Project of Institute of Computing Technology, CAS, under Grant 5120186140.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architectural simulation; Big Data; big data applications; Coherence; Computational modeling; Computer architecture; Data models; energy models; heterogeneous aspects; heterogeneous computing; in-memory processing logic; Kernel; memory architecture; memory devices; memory research; memory system; performance evaluation; PIM coherence; PIM designs; PIM simulator; PIM-enabled instructions; PIMSim; Process-in-Memory; Processing-in-memory; processing-in-memory simulator; Program processors; reliable PIM architecture modeling; simulation modes; simulator; system-level researches; Tools", number-of-cited-references = "22", ORCID-numbers = "Wang, Ying/0000-0001-5172-4736", research-areas = "Computer Science", times-cited = "0", unique-id = "Xu:2019:PFD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Shomron:2019:SCV, author = "Gil Shomron and Uri Weiser", title = "Spatial Correlation and Value Prediction in Convolutional Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "10--13", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2890236", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Convolutional neural networks (CNNs) are a widely used form of deep neural networks, introducing state-of-the-art results for different problems such as image classification, computer vision tasks, and speech recognition. However, CNNs are compute intensive, requiring billions of multiply-accumulate (MAC) operations per input. 
To reduce the number of MACs in CNNs, we propose a value prediction method that exploits the spatial correlation of zero-valued activations within the CNN output feature maps, thereby saving convolution operations. Our method reduces the number of MAC operations by 30.4 percent, averaged on three modern CNNs for ImageNet, with top-1 accuracy degradation of 1.7 percent, and top-5 accuracy degradation of 1.1 percent.", acknowledgement = ack-nhfb, affiliation = "Shomron, G (Reprint Author), Technion Israel Inst Technol, IL-3200003 Haifa, Israel. Shomron, Gil; Weiser, Uri, Technion Israel Inst Technol, IL-3200003 Haifa, Israel.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "gilsho@tx.technion.ac.il uri.weiser@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "HI0TZ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "CNNs; computer vision; computer vision tasks; Convolution; convolutional neural nets; convolutional neural network; convolutional neural networks; Correlation; Deep neural networks; deep neural networks; Degradation; image classification; ImageNet; learning (artificial intelligence); MAC operations; Microsoft Windows; multiply-accumulate operations; Neural networks; Predictive models; spatial correlation; speech recognition; value prediction; value prediction method; zero-valued activations", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "0", unique-id = "Shomron:2019:SCV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gupta:2019:DQL, author = "Ujjwal Gupta and Sumit K. Mandal and Manqing Mao and Chaitali Chakrabarti and Umit Y. 
Ogras", title = "A Deep {Q}-Learning Approach for Dynamic Management of Heterogeneous Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "14--17", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2892151", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Heterogeneous multiprocessor system-on-chips (SoCs) provide a wide range of parameters that can be managed dynamically. For example, one can control the type (big/little), number and frequency of active cores in state-of-the-art mobile processors at runtime. These runtime choices lead to more than 10$ \times $ range in execution time, 5$ \times $ range in power consumption, and 50$ \times $ range in performance per watt. Therefore, it is crucial to make optimum power management decisions as a function of dynamically varying workloads at runtime. This paper presents a reinforcement learning approach for dynamically controlling the number and frequency of active big and little cores in mobile processors. We propose an efficient deep Q-learning methodology to optimize the performance per watt (PPW). Experiments using Odroid XU3 mobile platform show that the PPW achieved by the proposed approach is within 1 percent of the optimal value obtained by an oracle.", acknowledgement = ack-nhfb, affiliation = "Mandal, SK (Reprint Author), Arizona State Univ, Sch Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta, Ujjwal; Mandal, Sumit K.; Mao, Manqing; Chakrabarti, Chaitali; Ogras, Umit Y., Arizona State Univ, Sch Elect Comp \& Energy Engn, Tempe, AZ 85281 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "ujjwal@asu.edu skmandal@asu.edu mmao7@asu.edu chaitali@asu.edu umit@asu.edu", da = "2019-06-20", doc-delivery-number = "HL5MF", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CNS-1526562]; Semiconductor Research Corp. [2721.001]", funding-text = "This work was supported by NSF grant CNS-1526562 and Semiconductor Research Corp. task 2721.001.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "active cores; deep Q-learning approach; Deep reinforcement learning; dynamic management; execution time; Frequency control; Heterogeneous multi-cores; heterogeneous processors; Instruments; learning (artificial intelligence); Memory management; mobile computing; mobile processors; multiprocessing systems; multiprocessor system-on-chips; Odroid XU3 mobile platform show; optimum power management decisions; power aware computing; power consumption; Power demand; Power management; Power system management; PPW; reinforcement learning approach; Runtime; SoCs; system-on-chip; Training", number-of-cited-references = "15", research-areas = "Computer Science", times-cited = "0", unique-id = "Gupta:2019:DQL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Rogers:2019:SLB, author = "Samuel Rogers and Joshua Slycord and Ronak Raheja and Hamed Tabkhi", title = "Scalable {LLVM}-Based Accelerator Modeling in gem5", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "18--21", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2893932", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/python.bib", abstract = "This article proposes a scalable 
integrated system architecture modeling for hardware accelerator based in gem5 simulation framework. The core of proposed modeling is a LLVM-based simulation engine for modeling any customized data-path with respect to inherent data/instruction-level parallelism (derived by algorithms) and available compute units (defined by the user). The simulation framework also offers a general-purpose communication interface that allows a scalable and flexible connection into the gem5 ecosystem through the Python API of gem5, enabling modifications to the system hierarchy without the need to rebuild the underlying simulator. Our simulation framework currently supports full-system simulation (both bare-metal and a full Linux kernel) for ARM-based systems, with future plans to add support for RISC-V. The LLVM-based modeling and modular integration to gem5 allow long-term simulation expansion and sustainable design modeling for emerging applications with demands for acceleration.", acknowledgement = ack-nhfb, affiliation = "Rogers, S (Reprint Author), Univ North Carolina, Dept Elect \& Comp Engn, Charlotte, NC 28223 USA. Rogers, Samuel; Slycord, Joshua; Raheja, Ronak; Tabkhi, Hamed, Univ North Carolina, Dept Elect \& Comp Engn, Charlotte, NC 28223 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "sroger48@uncc.edu jslycord@uncc.edu rraheja@uncc.edu htabkhiv@uncc.edu", da = "2019-06-20", doc-delivery-number = "HL5MF", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application program interfaces; ARM-based systems; Computational modeling; Computer architecture simulation; customized data-path; Engines; field programmable gate arrays; flexible connection; full-system simulation; gem5 ecosystem; gem5 simulation framework; general-purpose communication interface; Hardware; hardware accelerator; hardware accelerators; heterogeneous systems; inherent data; instruction-level parallelism; Linux; LLVM-based modeling; LLVM-based simulation engine; logic design; long-term simulation expansion; microprocessor chips; multiprocessing systems; parallel architectures; parallel programming; program compilers; reduced instruction set computing; Registers; RISC-V; Runtime; scalable connection; scalable integrated system architecture modeling; scalable LLVM-based accelerator modeling; Space exploration; sustainable design modeling; Synchronization; system hierarchy", number-of-cited-references = "11", ORCID-numbers = "Slycord, Joshua/0000-0002-0569-4094 Rogers, Samuel/0000-0002-9697-2933", research-areas = "Computer Science", times-cited = "0", unique-id = "Rogers:2019:SLB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Akin:2019:CAP, author = "Berkin Akin and Alaa R. Alameldeen", title = "A Case For Asymmetric Processing in Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "22--25", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2894800", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "By sidestepping the limitations at the memory interface, processing-in-memory (PIM) unlocks internally available memory bandwidth to the compute units on the memory side. 
This abundant bandwidth is conventionally utilized by highly-parallel throughput-oriented many-core style PIM architectures via offloading bandwidth-bound parallel tasks. However, it can be difficult to fully isolate these PIM-suitable tasks, and an offloaded program may include compute-bound sequential phases. These PIM-averse phases constitute a critical performance bottleneck for conventional many-core style PIM architectures. In this paper, we propose an analytical model for PIM execution that considers a program's bandwidth demand as well as its parallelism. Based on the proposed model, we make a case for an asymmetric PIM architecture that can mitigate the performance bottlenecks for PIM-averse phases while keeping the performance upside for PIM-suitable phases.", acknowledgement = ack-nhfb, affiliation = "Akin, B (Reprint Author), Intel Labs, Hillsboro, OR 97124 USA. Akin, Berkin; Alameldeen, Alaa R., Intel Labs, Hillsboro, OR 97124 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "berkin.akin@intel.com alaa.r.alameldeen@intel.com", da = "2019-06-20", doc-delivery-number = "HL5MF", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; analytical performance model; asymmetric multicore; asymmetric PIM architecture; asymmetric processing; Bandwidth; bandwidth-bound parallel tasks; Computational modeling; compute-bound sequential phases; critical performance bottleneck; memory bandwidth; memory interface; microprocessor chips; Multicore processing; multiprocessing systems; parallel processing; performance evaluation; PIM execution; PIM-averse phases; PIM-suitable tasks; Processing in memory; processing-in-memory; Silicon; Task analysis; throughput-oriented many-core style PIM", keywords-plus = "AMDAHLS LAW", number-of-cited-references = "9", ORCID-numbers = "Akin, Berkin/0000-0001-6908-5581", research-areas = "Computer Science", times-cited = "0", unique-id = "Akin:2019:CAP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tovletoglou:2019:SIH, author = "Konstantinos Tovletoglou and Lev Mukhanov and Dimitrios S. Nikolopoulos and Georgios Karakonstantis", title = "{Shimmer}: Implementing a Heterogeneous-Reliability {DRAM} Framework on a Commodity Server", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "26--29", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2893189", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper, we present the implementation of a heterogeneous-reliability DRAM framework, Shimmer, on a commodity server with a fully fledged OS. Shimmer enables splitting of DRAM into multiple domains with varying reliability and allocation of data depending on their criticality. 
Compared to existing studies which use simulators, we consider practical restrictions stemming from the real hardware and investigate methods to overcome them. In particular, we reveal that the implementation of the heterogeneous-reliability memory framework requires disabling of the hardware memory interleaving, which results in a significant degradation of the system performance. To overcome the induced performance loss, we develop a software-based interleaving. We evaluate the performance, power and energy of the server using 35 benchmarks across three memory configurations: the baseline configuration; with disabled hardware memory interleaving and Shimmer with software-based memory interleaving. Our results show that Shimmer introduces a minor 6\% performance overhead, while reducing the average DRAM power by 19.9\% when memory operates under relaxed refresh rate and lowered memory supply voltage. As one of our main contributions we demonstrate that a heterogeneous-reliability framework based on Shimmer can be realized on a commodity server and save 9.1\% of the total processor and memory energy.", acknowledgement = ack-nhfb, affiliation = "Tovletoglou, K (Reprint Author), Queens Univ Belfast, Belfast BT7 1NN, Antrim, North Ireland. Tovletoglou, Konstantinos; Mukhanov, Lev; Nikolopoulos, Dimitrios S.; Karakonstantis, Georgios, Queens Univ Belfast, Belfast BT7 1NN, Antrim, North Ireland.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ktovletoglou01@qub.ac.uk l.mukhanov@qub.ac.uk d.nikolopoulos@qub.ac.uk g.karakonstantis@qub.ac.uk", da = "2019-06-20", doc-delivery-number = "HL5WL", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "European Union [688540]", funding-text = "This work is funded by the H2020 Programme of the European Union under grant no. 688540 (the UniServer Project).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "average DRAM power; Bandwidth; commodity server; critical data; disabled hardware memory interleaving; DRAM; DRAM chips; energy saving; Hardware; heterogeneous-reliability DRAM framework; heterogeneous-reliability memory; heterogeneous-reliability memory framework; induced performance loss; integrated circuit reliability; interleaved storage; lowered memory supply voltage; memory configurations; memory interleaving; Memory management; Power efficiency; Random access memory; reliability; Reliability; Resource management; Servers; Shimmer; software-based interleaving; software-based memory interleaving", number-of-cited-references = "17", ORCID-numbers = "Nikolopoulos, Dimitrios/0000-0003-0217-8307 Tovletoglou, Konstantinos/0000-0002-1513-3143", research-areas = "Computer Science", times-cited = "0", unique-id = "Tovletoglou:2019:SIH", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kumar:2019:HRA, author = "Chanchal Kumar and Sidharth Singh and Gregory T. Byrd", title = "Hybrid Remote Access Protocol", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "30--33", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2896116", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The invalidation-based cache coherence protocols used in current CMPs result in inefficient utilization of cache hierarchy in the presence of heavy sharing, since a significant percentage of shared cached data is invalidated soon after it is brought into the private cache. 
This work presents an analysis of a shared memory cache coherence protocol; based on novel insights from the analysis, we advocate direct remote reads/writes at the shared last-level cache for heavily contended data. Evaluation of our proposed protocol with the Splash2x kernels shows 17 percent geometric mean speedup over traditional MESI coherence and 8.5 percent better performance than prior remote-access proposals.", acknowledgement = ack-nhfb, affiliation = "Kumar, C (Reprint Author), North Carolina State Univ, Raleigh, NC 27695 USA. Kumar, Chanchal; Byrd, Gregory T., North Carolina State Univ, Raleigh, NC 27695 USA. Singh, Sidharth, North Carolina State Univ, Apple Inc, Raleigh, NC 27695 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ckumar2@ncsu.edu sssingh4@ncsu.edu gbyrd@ncsu.edu", da = "2019-06-20", doc-delivery-number = "HL5WL", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Access protocols; Benchmark testing; cache hierarchy; cache storage; CMPs; Coherence; direct remote reads; direct remote writes; geometric mean speedup; Hardware; hybrid remote access protocol; invalidation-based cache coherence protocols; Kernel; memory hierarchy; MESI coherence; microprocessor chips; multi-core/single-chip multiprocessors; Parallel architectures; private cache; Proposals; protocols; shared cached data; shared last-level cache; shared memory cache coherence protocol; shared memory systems; Splash2x kernels", number-of-cited-references = "10", ORCID-numbers = "Byrd, Gregory/0000-0003-3647-8738", research-areas = "Computer Science", times-cited = "0", unique-id = "Kumar:2019:HRA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wang:2019:DDD, author = "Yicheng Wang and Yang Liu and Peiyun Wu and Zhao Zhang", title = "Detect {DRAM} Disturbance Error by Using 
Disturbance Bin Counters", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "34--37", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2897299", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "DRAM disturbance errors are increasingly a concern to computer system reliability and security. There have been a number of designs to detect and prevent them; however, there lacks any design that guarantees 100 percent detection (no false negative) with a small and fixed hardware cost. This paper presents such a design based on a novel idea called disturbance bin counter (DBC). Each DBC is a complex counter that maintains an upper bound of disturbances for a bin of DRAM rows. Their access is not in the critical path of processor execution and thus incurs no performance overhead. The design is optimized at the circuit level to minimize the storage requirement. Our simulation results using multi-core SPEC CPU2006 workloads show that no false positive occurs with a 1,024-entry DBC table, which requires only 4.5 KB storage. The design can be incorporated into a memory controller to guarantee the detection of DRAM disturbance errors or row hammering by malicious programs.", acknowledgement = ack-nhfb, affiliation = "Wang, YC (Reprint Author), Univ Illinois, Chicago, IL 60607 USA. Wang, Yicheng; Liu, Yang; Wu, Peiyun; Zhang, Zhao, Univ Illinois, Chicago, IL 60607 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "ywang271@uic.edu yliu327@uic.edu pwu27@uic.edu zhangz@uic.edu", da = "2019-06-20", doc-delivery-number = "HL5WL", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation [CCF-1618104, CCF-1643271]", funding-text = "The authors appreciate the constructive comments from the anonymous reviewers. This work is supported in part by the US National Science Foundation under grants CCF-1618104 and CCF-1643271.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "complex counter; Computer architecture; computer system reliability; counting circuits; DBC table; disturbance bin counter; DRAM; DRAM chips; DRAM disturbance errors; DRAM rows; fixed hardware cost; Hash functions; Indexes; malicious programs; memory size 4.5 KByte; Microprocessors; Random access memory; reliability; row-hammering; Transistors; Upper bound", number-of-cited-references = "10", ORCID-numbers = "Wu, Peiyun/0000-0001-5675-6454 Liu, Yang/0000-0002-7377-1418 Wang, Yicheng/0000-0003-1079-5591", research-areas = "Computer Science", times-cited = "0", unique-id = "Wang:2019:DDD", web-of-science-categories = "Computer Science, Hardware \& Architecture", xxpages = "35--38", } @Article{Xie:2019:NXB, author = "Xinfeng Xie and Xing Hu and Peng Gu and Shuangchen Li and Yu Ji and Yuan Xie", title = "{NNBench-X}: Benchmarking and Understanding Neural Network Workloads for Accelerator Designs", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "38--42", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2898196", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The tremendous impact of deep learning algorithms over a 
wide range of application domains has encouraged a surge of neural network (NN) accelerator research. An evolving benchmark suite and its associated benchmark method are needed to incorporate emerging NN models and characterize NN workloads. In this paper, we propose a novel approach to understand the performance characteristic of NN workloads for accelerator designs. Our approach takes as input an application candidate pool and conducts an operator-level analysis and application-level analysis to understand the performance characteristics of both basic tensor primitives and whole applications. We conduct a case study on the TensorFlow model zoo by using this proposed characterization method. We find that tensor operators with the same functionality can have very different performance characteristics under different input sizes, while operators with different functionality can have similar characteristics. Additionally, we observe that without operator-level analysis, the application bottleneck is mischaracterized for 15 out of 57 models from the TensorFlow model zoo. Overall, our characterization method helps users select representative applications out of the large pool of possible applications, while providing insightful guidelines for the design of NN accelerators.", acknowledgement = ack-nhfb, affiliation = "Xie, XF (Reprint Author), Univ Calif Santa Barbara, Santa Barbara, CA 93106 USA. Xie, Xinfeng; Hu, Xing; Gu, Peng; Li, Shuangchen; Ji, Yu; Xie, Yuan, Univ Calif Santa Barbara, Santa Barbara, CA 93106 USA.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "xinfeng@ucsb.edu xinghu@ucsb.edu peng\_gu@umail.ucsb.edu shuangchenli@ece.ucsb.edu maple.jiyu@hotmail.com yuanxie@ucsb.edu", da = "2019-06-20", doc-delivery-number = "HQ4FG", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "US National Science Foundation [1500848/172544/1730309]; CRISP--DARPA", funding-text = "This work was supported in part by US National Science Foundation 1500848/172544/1730309 and by CRISP, one of six centers in JUMP, a Semiconductor Research Corporation program sponsored by DARPA.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator designs; application-level analysis; Artificial neural networks; benchmark; benchmark method; benchmark testing; Benchmark testing; characterization method; deep learning algorithms; Feature extraction; Hardware; learning (artificial intelligence); Measurement; neural nets; Neural network; neural network accelerator research; neural network workloads; NN accelerators; NN workloads; NNBench-X; operator-level analysis; Parallel processing; performance characteristic; tensor operators; TensorFlow model zoo; workload characterization", number-of-cited-references = "22", research-areas = "Computer Science", times-cited = "0", unique-id = "Xie:2019:NXB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Khan:2019:RCA, author = "Asif Ali Khan and Fazal Hameed and Robin Bl{\"a}sing and Stuart Parkin and Jeronimo Castrillon", title = "{RTSim}: a Cycle-Accurate Simulator for Racetrack Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "43--46", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2899306", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Racetrack memories (RTMs) have drawn considerable attention from computer architects of late. Owing to the ultra-high capacity and comparable access latency to SRAM, RTMs are promising candidates to revolutionize the memory subsystem. In order to evaluate their performance and suitability at various levels in the memory hierarchy, it is crucial to have RTM-specific simulation tools that accurately model their behavior and enable exhaustive design space exploration. To this end, we propose RTSim, an open source cycle-accurate memory simulator that enables performance evaluation of the domain-wall-based racetrack memories. The skyrmions-based RTMs can also be modeled with RTSim because they are architecturally similar to domain-wall-based RTMs. RTSim is developed in collaboration with physicists and computer scientists. It accurately models RTM-specific shift operations, access ports management and the sequence of memory commands beside handling the routine read/write operations. RTSim is built on top of NVMain2.0, offering larger design space for exploration.", acknowledgement = ack-nhfb, affiliation = "Khan, AA (Reprint Author), Tech Univ Dresden, Chair Compiler Construct, D-01069 Dresden, Germany. Khan, Asif Ali; Hameed, Fazal; Castrillon, Jeronimo, Tech Univ Dresden, Chair Compiler Construct, D-01069 Dresden, Germany. Blaesing, Robin; Parkin, Stuart, Max Planck Inst Microstruct Phys Halle, D-06120 Halle, Germany. Hameed, Fazal, Inst Space Technol, Islamabad 44000, Pakistan.", ajournal = "IEEE Comput. Archit. 
Lett.", author-email = "asif\_ali.khan@tu-dresden.de fazal.hameed@tu-dresden.de blaesing@mpi-halle.mpg.de stuart.parkin@mpi-halle.mpg.de jeronimo.castrillon@tu-dresden.de", da = "2019-06-20", doc-delivery-number = "HQ4FG", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "German Research Council (DFG) through the Cluster of Excellence `Center for Advancing Electronics Dresden' (cfaed)", funding-text = "This work was partially funded by the German Research Council (DFG) through the Cluster of Excellence `Center for Advancing Electronics Dresden' (cfaed).", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache; comparable access latency; Computational modeling; cycle-accurate simulator; design space exploration; domain wall memory; domain-wall-based racetrack memories; domain-wall-based RTM; emerging memory technologies; Layout; main memory; memory hierarchy; Memory management; Memory simulator; memory subsystem; memory system; models RTM-specific shift operations; Nonvolatile memory; NVM; open source cycle-accurate memory simulator; racetrack memory; Random access memory; random-access storage; RTM-specific simulation tools; RTSim; scratchpad; simulation; skyrmions-based RTM; Space exploration; storage management; Tracking", keywords-plus = "PERFORMANCE; MODEL; AREA", number-of-cited-references = "19", research-areas = "Computer Science", times-cited = "0", unique-id = "Khan:2019:RCA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Gan:2019:SSV, author = "Yiming Gan and Yuxian Qiu and Jingwen Leng and Yuhao Zhu", title = "{SVSoC}: Speculative Vision Systems-on-a-Chip", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "47--50", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2903241", ISSN = "1556-6056 (print), 1556-6064 
(electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Frame latency in continuous vision significantly impacts the agility of intelligent machines that interact with the environment via cameras. However, today's continuous vision systems limit the frame latency due to their fundamental sequential execution model. We propose a speculative execution model along with two mechanisms that enable practical vision speculation. We present SVSoC, a new mobile Systems-on-a-chip (SoC) architecture that augments conventional mobile SoCs with the speculation capability. Under the same energy budget, SVSoC achieves 14.3 to 35.4 percent latency reduction in different scenarios.", acknowledgement = ack-nhfb, affiliation = "Gan, YM (Reprint Author), Univ Rochester, Comp Sci, 601 Elmwood Ave, Rochester, NY 14627 USA. Gan, Yiming; Zhu, Yuhao, Univ Rochester, Comp Sci, 601 Elmwood Ave, Rochester, NY 14627 USA. Qiu, Yuxian, Shanghai Jiao Tong Univ, Comp Sci, Shanghai 200240, Peoples R China. Leng, Jingwen, Shanghai Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200240, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ygan10@ur.rochester.edu qiuyuxian@sjtu.edu.cn leng-jw@sjtu.edu.cn yzhu@rochester.edu", da = "2019-06-20", doc-delivery-number = "HS8NK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computational modeling; computer vision; Continuous vision; continuous vision systems; control engineering computing; fundamental sequential execution model; Imaging; intelligent machines; IP networks; microprocessor chips; mobile systems-on-a-chip architecture; practical vision speculation; Predictive models; Runtime; Sensors; speculation; speculation capability; speculative execution model; speculative vision systems-on-a-chip; SVSoC; system-on-chip; systems-on-a-chip; Task analysis", number-of-cited-references = "11", ORCID-numbers = "Gan, Yiming/0000-0002-2033-5057", research-areas = "Computer Science", times-cited = "0", unique-id = "Gan:2019:SSV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lin:2019:DSE, author = "Ting-Ru Lin and Yunfan Li and Massoud Pedram and Lizhong Chen", title = "Design Space Exploration of Memory Controller Placement in Throughput Processors with Deep Learning", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "51--54", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2905587", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "As throughput-oriented processors incur a significant number of data accesses, the placement of memory controllers (MCs) has a critical impact on overall performance. However, due to the lack of a systematic way to explore the huge design space of MC placements, only a few ad-hoc placements have been proposed, leaving much of the opportunity unexploited. In this paper, we present a novel deep-learning based framework that explores this opportunity intelligently and automatically. 
The proposed framework employs a genetic algorithm to efficiently guide exploration through the large design space while utilizing deep learning methods to provide fast performance prediction of design points instead of relying on slow full system simulations. Evaluation shows that the proposed deep learning models achieve a speedup of 282X for the search process, and the MC placement found by our framework improves the average performance (IPC) of 18 benchmarks by 19.3 percent over the best-known placement found by human intuition.", acknowledgement = ack-nhfb, affiliation = "Lin, TR (Reprint Author), Univ Southern Calif, Los Angeles, CA 90007 USA. Lin, Ting-Ru; Pedram, Massoud, Univ Southern Calif, Los Angeles, CA 90007 USA. Li, Yunfan; Chen, Lizhong, Oregon State Univ, Corvallis, OR 97331 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tingruli@usc.edu liyunf@oregonstate.edu pedram@usc.edu chenliz@oregonstate.edu", da = "2019-06-20", doc-delivery-number = "HS8NK", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [1566637, 1619456, 1619472, 1750047]; National Science Foundation Software and Hardware Foundations", funding-text = "We appreciate Shao-Hua Sun's assistance in DNN development. This research is supported, in part, by the National Science Foundation grants \#1566637, \#1619456, \#1619472 and \#1750047, and Software and Hardware Foundations.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ad-hoc placements; Benchmark testing; computer architecture; Computer architecture; computer architecture; Computer architecture; critical impact; data accesses; Deep learning; deep learning; Deep learning; deep learning; deep learning methods; deep-learning based framework; design points; design space; design space exploration; fast performance prediction; genetic algorithm; genetic algorithms; Interconnection networks; Kernel; MC placement; memory architecture; memory controller placement; memory controllers; neural nets; Program processors; search problems; search process; Space exploration; Throughput; throughput processors; throughput-oriented processors", keywords-plus = "GAME; GO", number-of-cited-references = "10", ORCID-numbers = "Lin, Ting-Ru/0000-0002-7272-4070 Chen, Lizhong/0000-0001-5890-7121", research-areas = "Computer Science", times-cited = "0", unique-id = "Lin:2019:DSE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Arafa:2019:PGS, author = "Yehia Arafa and Abdel-Hameed A. Badawy and Gopinath Chennupati and Nandakishore Santhi and Stephan Eidenbenz", title = "{PPT--GPU}: Scalable {GPU} Performance Modeling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "55--58", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2904497", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Performance modeling is a challenging problem due to the complexities of hardware architectures. 
In this paper, we present PPT-GPU, a scalable and accurate simulation framework that enables GPU code developers and architects to predict the performance of applications in a fast, and accurate manner on different GPU architectures. PPT-GPU is part of the open source project, Performance Prediction Toolkit (PPT) developed at the Los Alamos National Laboratory. We extend the old GPU model in PPT that predicts the runtimes of computational physics codes to offer better prediction accuracy, for which, we add models for different memory hierarchies found in GPUs and latencies for different instructions. To further show the utility of PPT-GPU, we compare our model against real GPU device(s) and the widely used cycle-accurate simulator, GPGPU-Sim using different workloads from RODINIA and Parboil benchmarks. The results indicate that the predicted performance of PPT-GPU is within a 10 percent error compared to the real device(s). In addition, PPT-GPU is highly scalable, where it is up to 450x faster than GPGPU-Sim with more accurate results.", acknowledgement = ack-nhfb, affiliation = "Arafa, Y (Reprint Author), New Mexico State Univ, Klipsch Sch ECE, Las Cruces, NM 88003 USA. Arafa, Yehia; Badawy, Abdel-Hameed A., New Mexico State Univ, Klipsch Sch ECE, Las Cruces, NM 88003 USA. Badawy, Abdel-Hameed A.; Chennupati, Gopinath; Santhi, Nandakishore; Eidenbenz, Stephan, Los Alamos Natl Lab, SM 30, Los Alamos, NM 87545 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "yarafa@nmsu.edu badawy@nmsu.edu gchennupati@lanl.gov nsanthi@lanl.gov eidenben@lanl.gov", da = "2019-06-20", doc-delivery-number = "HU4EG", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "U.S. Department of Energy (DOE) National Nuclear Security Administration (NNSA) [DE-AC52-06NA25396]", funding-text = "The authors would like to thank the anonymous reviewers for their feedback which improved the quality of the paper. 
We would also like to thank the members of the PEARL laboratory at NMSU. Parts of this research used resources provided at the Los Alamos National Laboratory Institutional Computing Program, which is supported through the U.S. Department of Energy (DOE) National Nuclear Security Administration (NNSA) under Contract No. DE-AC52-06NA25396. Computations were run on Darwin, a research computing heterogeneous cluster. Any opinions, findings, and/or conclusions expressed in this paper do not necessarily represent the views of the DOE or the U.S. Government.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architects; C language; Computational modeling; Computer architecture; GPGPU; GPGPU-Sim; GPU architectures; GPU device; GPU modeling; graphics processing units; Graphics processing units; Kernel; Los Alamos national laboratory; old GPU model; open source project; parallel architectures; Parboil benchmarks; performance evaluation; performance prediction; performance prediction toolkit; power aware computing; PPT; PPT-GPU; Predictive models; RODINIA; Runtime; scalable GPU Performance modeling; software/hardware co-design; Task analysis", keywords-plus = "ROOFLINE", number-of-cited-references = "22", ORCID-numbers = "Badawy, Abdel-Hameed/0000-0001-8027-1449", research-areas = "Computer Science", times-cited = "0", unique-id = "Arafa:2019:PGS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Denby:2019:OEC, author = "Bradley Denby and Brandon Lucia", title = "Orbital Edge Computing: Machine Inference in Space", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "59--62", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2907539", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Edge computing is an emerging paradigm aiding responsiveness, reliability, and scalability of terrestrial computing and sensing networks like cellular and IoT. However, edge computing is largely unexplored in high-datarate nanosatellite constellations. Cubesats are small, energy-limited sensors separated from the cloud by hundreds of kilometers of atmosphere and space. As they proliferate, centralized architectures impede advanced applications. In this work, we define and characterize Orbital Edge Computing. We describe power and software optimizations for the orbital edge, and we use formation flying to parallelize computation in space.", acknowledgement = ack-nhfb, affiliation = "Denby, B (Reprint Author), Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Denby, Bradley; Lucia, Brandon, Carnegie Mellon Univ, Pittsburgh, PA 15213 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "bdenby@andrew.cmu.edu blucia@andrew.cmu.edu", da = "2019-06-20", doc-delivery-number = "HU4EG", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Kavcic-Moura Endowment Fund; US National Science Foundation CAREER Award [1751029]", funding-text = "We thank the reviewers for the helpful feedback. This work was generously funded by the Kavcic-Moura Endowment Fund and US National Science Foundation CAREER Award \#1751029.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "artificial satellites; Cameras; computer vision; Cubesat; CubeSat; Downlink; edge computing; high-datarate nanosatellite constellations; Internet of Things; machine inference; orbital edge computing; Orbits; paradigm aiding responsiveness; Pipeline processing; remote sensing; satellite communication; Sensors; telecommunication computing; telecommunication network reliability; terrestrial computing; wireless sensor networks", number-of-cited-references = "39", research-areas = "Computer Science", times-cited = "0", unique-id = "Denby:2019:OEC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Liu:2019:UFT, author = "He Liu and Jianhui Han and Youhui Zhang", title = "A Unified Framework for Training, Mapping and Simulation of {ReRAM}-Based Convolutional Neural Network Acceleration", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "63--66", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2908374", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "ReRAM-based neural network accelerators (RNAs) could outshine their digital counterparts in terms of computational efficiency and performance remarkably. However, some open software tool for broad architectural exploration and end-to-end evaluation are still missing. We present a simulation framework of RNA for CNN inference that encompasses a ReRAM-aware NN training tool, a CNN-oriented mapper and a micro-architecture simulator. Main characteristics of ReRAM and circuits are reflected by the configurable simulator, as well as by the customized training algorithm. 
The function of the simulator's core components is verified by the corresponding circuit simulation of a real chip design. This framework enables comprehensive architectural exploration and end-to-end evaluation, and its preliminary version is available at https://github.com/CRAFT-THU/XB-Sim.", acknowledgement = ack-nhfb, affiliation = "Zhang, YH (Reprint Author), Tsinghua Univ, Dept Comp Sci \& Technol, Beijing 100084, Peoples R China. Liu, He; Zhang, Youhui, Tsinghua Univ, Dept Comp Sci \& Technol, Beijing 100084, Peoples R China. Han, Jianhui, Tsinghua Univ, Inst Microelect, Beijing 100084, Peoples R China.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "liuhe94@hotmail.com hanjh16@mails.tsinghua.edu.cn zyh02@tsinghua.edu.cn", da = "2019-06-20", doc-delivery-number = "HU4EG", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Beijing Innovation Center for Future Chip; Science and Technology Innovation Special Zone project, China; HUAWEI project", funding-text = "Thanks for the support from Beijing Innovation Center for Future Chip, the support of the Science and Technology Innovation Special Zone project, China, and the support of HUAWEI project.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator; Artificial neural networks; CNN inference; CNN-oriented mapper; computational efficiency; Computational modeling; Computer architecture; configurable simulator; convolutional neural nets; customized training algorithm; Deep neural network; digital counterparts; end-to-end evaluation; Hardware; learning (artificial intelligence); microarchitecture simulator; Microprocessors; open software tool; processing-in-memory; ReRAM; ReRAM-aware NN training tool; ReRAM-based convolutional neural network acceleration; ReRAM-based neural network accelerators; RNA; simulation; Training", number-of-cited-references = "22", ORCID-numbers = "Liu, He/0000-0002-9117-5265 Han, Jianhui/0000-0002-8705-134X", research-areas = "Computer Science", times-cited = "0", unique-id = "Liu:2019:UFT", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tan:2019:DWO, author = "Tian Tan and Eriko Nurvitadhi and Derek Chiou", title = "Dark Wires and the Opportunities for Reconfigurable Logic", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "67--70", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2909867", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Power has become a fundamental limit to silicon performance. Most research has focused on reducing transistor switching to constrain power (dark silicon.) Specialized accelerators have been proposed since they implement functionality with fewer transistor switches than general purpose cores. Increasing efficiency requirements lead to more specialization and, therefore, more accelerators that potentially leads to longer distances to get to all the accelerators. 
Communication, however, consumes energy, and therefore needs to be minimized as well (dark wires.) This paper examines the balance between compute and communication specialization in the context of hard logic (e.g., ASIC) that is highly efficient but static versus soft logic (e.g., FPGA) that is less efficient but allows computation to be moved to reduce communication distances. Our experimental results show using soft accelerators consumes 0.6$ \times $-2.1$ \times $ total power compared to using hard accelerators when communication costs are taken into account.", acknowledgement = ack-nhfb, affiliation = "Tan, T (Reprint Author), Univ Texas Austin, Elect \& Comp Engn, Austin, TX 78712 USA. Tan, Tian, Univ Texas Austin, Elect \& Comp Engn, Austin, TX 78712 USA. Nurvitadhi, Eriko, Intel Corp, Santa Clara, CA 95054 USA. Chiou, Derek, Univ Texas Austin, Austin, TX 78712 USA. Chiou, Derek, Microsoft, Austin, TX 78712 USA.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "tan.tian@utexas.edu eriko.nurvitadhi@intel.com derek@utexas.edu", da = "2019-06-20", doc-delivery-number = "HW7ZH", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Intel Corporation, Hillsboro, OR", funding-text = "Funding for this work was provided by Intel Corporation, Hillsboro, OR. The authors would like to thank the colleagues in the Accelerator Architecture Lab at Intel Corporation, Hillsboro, OR and FAST research group at the University of Texas at Austin, Austin, TX for the discussion and feedback.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "application specific integrated circuits; ASIC; Benchmark testing; communication costs; communication distances; communication specialization; dark silicon; dark wires; efficiency requirements; elemental semiconductors; energy efficient architecture; Field programmable gate arrays; field programmable gate arrays; FPGA; fundamental limit; general purpose cores; geographical locality; hard logic; hardware accelerator; Layout; low-power electronics; reconfigurable logic; Silicon; silicon performance; soft accelerators; Specialized accelerators; static versus soft logic; Throughput; transistor circuits; transistor switches; transistor switching; Transistors; wires; Wires", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "0", unique-id = "Tan:2019:DWO", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Naithani:2019:PRE, author = "Ajeya Naithani and Josue Feliu and Almutaz Adileh and Lieven Eeckhout", title = "Precise Runahead Execution", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "71--74", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2910518", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Runahead execution improves processor performance by accurately prefetching long-latency memory accesses. When a long-latency load causes the instruction window to fill up and halt the pipeline, the processor enters runahead mode and keeps speculatively executing code to trigger accurate prefetches. 
A recent improvement tracks the chain of instructions that leads to the long-latency load, stores it in a runahead buffer, and executes only this chain during runahead execution, with the purpose of generating more prefetch requests during runahead execution. Unfortunately, all these prior runahead proposals have shortcomings that limit performance and energy efficiency because they discard the full instruction window to enter runahead mode and then flush the pipeline to restart normal operation. This significantly constrains the performance benefits and increases the energy overhead of runahead execution. In addition, runahead buffer limits prefetch coverage by tracking only a single chain of instructions that lead to the same long-latency load. We propose precise runahead execution (PRE) to mitigate the shortcomings of prior work. PRE leverages the renaming unit to track all the dependency chains leading to long-latency loads. PRE uses a novel approach to manage free processor resources to execute the detected instruction chains in runahead mode without flushing the pipeline. Our results show that PRE achieves an additional 21.1 percent performance improvement over the recent runahead proposals while reducing energy consumption by 6.1 percent.", acknowledgement = ack-nhfb, affiliation = "Naithani, A (Reprint Author), Univ Ghent, B-9000 Ghent, Belgium. Naithani, Ajeya; Adileh, Almutaz; Eeckhout, Lieven, Univ Ghent, B-9000 Ghent, Belgium. Feliu, Josue, Univ Politecn Valencia, Valencia 46010, Spain.", ajournal = "IEEE Comput. Archit. Lett.", author-email = "ajeya.naithani@ugent.be jofepre@gap.upv.es almutaz.adileh@ugent.be lieven.eeckhout@ugent.be", da = "2019-06-20", doc-delivery-number = "HW9SJ", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "FWO [G.0434.16N, G.0144.17N]; European Research Council (ERC) [741097]", funding-text = "This research is supported through FWO grants no. 
G.0434.16N and G.0144.17N, and European Research Council (ERC) Advanced Grant agreement no. 741097.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Buffer storage; dependency chains; energy efficiency; instruction window; long-latency load; long-latency memory accesses; Microarchitecture; Microsoft Windows; Out of order; pipeline processing; Pipelines; power aware computing; precise runahead execution; prefetch requests; Prefetching; Proposals; Registers; runahead buffer limits; runahead execution; single-core performance; storage management", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "0", unique-id = "Naithani:2019:PRE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Agrawal:2019:MPS, author = "V. Agrawal and M. A. Dinani and Y. Shui and M. Ferdman and N. Honarmand", title = "Massively Parallel Server Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "75--78", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2911287", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Modern data centers enjoy massive degrees of request-level parallelism with significant cross-request similarity. Although similar requests follow similar instruction sequences, conventional processors service them individually and do not take full advantage of cross-request similarity. 
Single-Instruction Multiple-Thread (SIMT) architectures can leverage this similarity, however, existing SIMT processors --- chief among them, GPUs --- are ill-suited for server applications, as they are specifically designed to maximize throughput at the expense of latency, preventing them from meeting server QoS requirements. We advocate a new approach to SIMT server processors, namely Massively Parallel Server Processors (MPSPs), which we outline in this paper. To begin to understand their architectural needs, we measure the degree of control-flow and memory-access divergence encountered when running unmodified server applications on MPSP-style processors. Our preliminary results indicate that a software scheduler that bundles together similar requests can minimize control-flow divergence, making SIMT execution of unmodified server code feasible. Moreover, we find that memory-access divergence, although significant in raw numbers, can be tackled with changes in stack and heap layouts. Overall, our results encourage further consideration of MPSPs as a promising architecture for server processors.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; data centers; Instruction sets; Message systems; Parallel processing; Quality of service; servers; Servers; Single Instruction Multiple Thread", } @Article{Golestani:2019:PMB, author = "H. Golestani and G. Gupta and R. 
Sen", title = "Performance Modeling and Bottleneck Analysis of {EDGE} Processors Using Dependence Graphs", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "79--82", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2911514", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Exploring new directions in ISA and microarchitecture design can be challenging due to the large search space. Efficient tools and methods are needed to quickly identify rewarding design choices. In this work, we develop a graph-based framework that effectively models complex architectures and enables efficient analysis of their performance and bottlenecks. We use this framework to investigate proposals for EDGE (Explicit Data Graph Execution) ISA, a new class of ISA in which programs are composed from atomic blocks, each of which explicitly exposes dataflow to hardware. We study the impact of two important EDGE-specific design choices: block formats and operand-movement instructions. We demonstrate how this analysis leads to insights in EDGE architectures.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; Bottleneck analysis; Data models; EDGE; EDGE (Explicit Data Graph Execution); Hardware; Hazards; ISA; Load modeling; Microarchitecture; microarchitecture; Microarchitecture; microarchitecture; performance modeling; Program processors", } @Article{Leng:2019:ARA, author = "J. Leng and A. Buyuktosunoglu and R. Bertran and P. Bose and V. J. 
Reddi", title = "Asymmetric Resilience for Accelerator-Rich Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "1", pages = "83--86", month = jan # "\slash " # jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2917898", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Accelerators are becoming popular owing to their exceptional performance and power-efficiency. However, researchers are yet to pay close attention to their reliability --- a key challenge as technology scaling makes building reliable systems challenging. A straightforward solution to make accelerators reliable is to design the accelerator from the ground-up to be reliable by itself. However, such a myopic view of the system, where each accelerator is designed in isolation, is unsustainable as the number of integrated accelerators continues to rise in SoCs. To address this challenge, we propose a paradigm called asymmetric resilience that avoids accelerator-specific reliability design. Instead, its core principle is to develop the reliable heterogeneous system around the CPU architecture. We explain the implications of architecting such a system and the modifications needed in a heterogeneous system to adopt such an approach. As an example, we demonstrate how to use asymmetric resilience to handle GPU execution errors using the CPU with minimal overhead. The general principles can be extended to include other accelerators.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator architecture; error recovery; Graphics processing units; heterogeneous system; Kernel; Memory management; Reliability; Resilience; Runtime; soft errors; Task analysis; voltage noise", } @Article{Sadredini:2019:SEM, author = "E. Sadredini and R. Rahimi and V. Verma and M. Stan and K. Skadron", title = "A Scalable and Efficient In-Memory Interconnect Architecture for Automata Processing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "87--90", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2909870", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Accelerating finite automata processing benefits regular-expression workloads and a wide range of other applications that do not map obviously to regular expressions, including pattern mining, bioinformatics, and machine learning. Existing in-memory automata processing accelerators suffer from inefficient routing architectures. They are either incapable of efficiently place-and-route a highly connected automaton or require an excessive amount of hardware resources. In this paper, we propose a compact, low-overhead, and yet flexible in-memory interconnect architecture that efficiently implements routing for next-state activation, and can be applied to the existing in-memory automata processing architectures. We use SRAM 8T subarrays to evaluate our interconnect. Compared to the Cache Automaton routing design, our interconnect reduces the number of switches $ 7 \times $, therefore, reduces area overhead for the interconnect. 
It also has faster row cycle time because of shorter wires and consumes less power.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Automata; automata processing; bioinformatics; cache automaton routing design; connected automaton; finite automata; finite automata processing; Hardware; hardware resources; in-memory automata; in-memory automata processing accelerators; in-memory interconnect architecture; Indexes; inefficient routing architectures; integrated circuit interconnections; Interconnect; machine learning; memory architecture; Memory management; next-state activation; pattern mining; processing in memory; Random access memory; regular expression workloads; Routing; SRAM 8T subarrays; SRAM chips", } @Article{Yasin:2019:TPM, author = "A. Yasin and A. Mendelson and Y. Ben-Asher", title = "Tuning Performance via Metrics with Expectations", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "91--94", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2916408", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Modern server systems employ many features that are difficult to exploit by software developers. This paper calls for a new performance optimization approach that uses designated metrics with expected optimal values. A key insight is that expected values of these metrics are essential in order to verify that no performance is wasted during incremental utilization of processor features. We define sample primary metrics for modern architectures and present three distinct techniques that help to determine their optimal values. 
Our preliminary results successfully provide 2x-4x extra speedup during tuning of commonly-used software optimizations on the matrix-multiply kernel. Additionally, our approach helped to identify counter-intuitive causes that hurt multicore scalability of an optimized deep-learning benchmark on a Cascade Lake server.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Cascade Lake server; Code tuning; counter-intuitive cause identification; expectations; expected optimal values; incremental utilization; Kernel; learning (artificial intelligence); matrix multiplication; matrix-multiply kernel; Measurement; measurements; micro-architecture; microprocessor chips; modern server systems; multi-core/single-chip multiprocessors; Multicore processing; multiprocessing systems; Optimization; optimization; optimized deep-learning benchmark; performance analysis; performance evaluation; performance optimization approach; processor features; sample primary metrics; Servers; SIMD processors; software metrics; software optimizations; Tuning; tuning performance", } @Article{Wang:2019:MEM, author = "L. Wang and M. Jahre and A. Adileh and Z. Wang and L. 
Eeckhout", title = "Modeling Emerging Memory-Divergent {GPU} Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "95--98", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2923618", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", abstract = "Analytical performance models yield valuable architectural insight without incurring the excessive runtime overheads of simulation. In this work, we study contemporary GPU applications and find that the key performance-related behavior of such applications is distinct from traditional GPU applications. The key issue is that these GPU applications are memory-intensive and have poor spatial locality, which implies that the loads of different threads commonly access different cache blocks. Such memory-divergent applications quickly exhaust the number of misses the L1 cache can process concurrently, and thereby cripple the GPU's ability to use Memory-Level Parallelism (MLP) and Thread-Level Parallelism (TLP) to hide memory latencies. Our Memory Divergence Model (MDM) is able to accurately represent this behavior and thereby reduces average performance prediction error by $ 14 \times $ compared to the state-of-the-art GPUMech approach across our memory-divergent applications.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; analytical performance models; Analytical performance prediction; average performance prediction error; cache blocks; cache storage; Computational modeling; contemporary GPU applications; GPU; graphics processing units; Graphics processing units; Instruction sets; key performance-related behavior; L1 cache; Mathematical model; memory architecture; memory divergence model; memory latencies; memory-divergent applications; memory-divergent GPU applications; memory-intensive; memory-level parallelism; multi-threading; multiprocessing systems; Predictive models; Random access memory; thread-level parallelism; traditional GPU applications; valuable architectural insight", } @Article{Shomron:2019:SSS, author = "G. Shomron and T. Horowitz and U. Weiser", title = "{SMT-SA}: Simultaneous Multithreading in Systolic Arrays", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "99--102", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2924007", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Systolic arrays (SAs) are highly parallel pipelined structures capable of executing various tasks such as matrix multiplication and convolution. They comprise a grid of usually homogeneous processing units (PUs) that are responsible for the multiply-accumulate (MAC) operations in the case of matrix multiplication. It is not rare for a PU input to be zero-valued, in which case the PU becomes idle and the array becomes underutilized. In this paper we consider a solution to employ the underutilized PUs via simultaneous multithreading (SMT). 
We explore the design space of a SMT-SA variant and evaluate its performance, area efficiency, and energy consumption. In addition, we suggest a tiling method to reduce area overheads. Our evaluation shows that a 4-thread FP16-based SMT-SA achieves speedups of up to $ 3.6 \times $ as compared to conventional SA, with $ 1.7 \times $ area overhead and negligible energy overhead.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "4-thread FP16-based SMT-SA; area efficiency; Convolution; Correlation; Deep learning; Energy consumption; energy consumption; homogeneous processing units; Instruction sets; matrix multiplication; multi-threading; multiply-accumulate operations; Multithreading; multithreading; parallel pipelined structures; PU input; simultaneous multithreading; SMT-SA variant; Systolic arrays; systolic arrays; Task analysis", } @Article{Masouros:2019:RRS, author = "D. Masouros and S. Xydis and D. Soudris", title = "{Rusty}: Runtime System Predictability Leveraging {LSTM} Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "103--106", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2924622", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Modern cloud scale data-centers are adopting workload co-location as an effective mechanism for improving resource utilization. However, workload co-location is stressing resource availability in unconventional and unpredictable manner. 
Efficient resource management requires continuous and ideally predictive runtime knowledge of system metrics, sensitive both to workload demands, e.g., CPU, memory etc., as well as interference effects induced by co-location. In this paper, we present Rusty, a framework able to address the aforementioned challenges by leveraging the power of Long Short-Term Memory networks to forecast at runtime, performance metrics of applications executed on systems under interference. We evaluate Rusty under a diverse set of interference scenarios for a plethora of cloud workloads, showing that Rusty achieves extremely high prediction accuracy, up to 0.99 in terms of $ R^2 $ value, satisfying at the same time the strict latency constraints to be usable at runtime.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; cloud computing; cloud workloads; computer centres; Correlation; datacenters; extremely high prediction accuracy; interference; Interference; interference effects; interference scenarios; long short-term memory networks; LSTM neural networks; Measurement; modern cloud scale data-centers; Monitoring; recurrent neural nets; resource allocation; resource availability; Resource management; resource management; resource utilization; Run-time system predictability; Runtime; runtime knowledge; runtime system predictability leveraging LSTM neural networks; Rusty; system metrics; unconventional manner; workload co-location", } @Article{Kim:2019:THA, author = "S. Kim and H. Jung and W. Shin and H. Lee and H. 
Lee", title = "{HAD-TWL}: Hot Address Detection-Based Wear Leveling for Phase-Change Memory Systems with Low Latency", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "107--110", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2929393", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Phase-change memory (PCM) is an emerging non-volatile memory device that offers faster access than flash memory does. However, PCM suffers from a critical problem where the number of write operations is limited. The previous practical attack detector (PAD) that uses a small memory space called stack adopts an algebraic mapping-based wear leveling (AWL) algorithm. Thanks to successful detection of malicious attacks, the PAD-AWL dramatically improves the lifetime of PCM. To enhance system factors such as write latency, the proposed method replaces the AWL algorithm with a table-based wear leveling (TWL) algorithm. Since the fixed stack size of the previous PAD is inefficient in detection of attack-like hot addresses, a stack size modulation scheme that enables a hot address detector (HAD) to efficiently counteract various memory write streams is proposed. Compared with the previous AWL-based algorithm, the integration with the TWL algorithm demands only 24 percent of the total number of swaps per write, and the proposed HAD with the stack size modulation scheme achieves the detection rate of 94 percent while reducing the execution time by 57 percent.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "algebraic mapping-based wear leveling algorithm; attack-like hot addresses; AWL-based algorithm; detection rate; Detectors; embedded memory management system; emerging nonvolatile memory device; endurance; fixed stack size; flash memories; flash memory; HAD-TWL; Hardware; hot address detection-based wear leveling; hot address detector; malicious attacks; Memory management; memory space; PAD-AWL; PCM; Phase change materials; phase change memories; Phase-change memory; phase-change memory systems; practical attack detector; Pulse modulation; Random access memory; stack size modulation scheme; system factors; table-based wear leveling algorithm; TWL algorithm; wear; wear leveling; write operations", } @Article{Zhou:2019:QCD, author = "H. Zhou and G. T. Byrd", title = "Quantum Circuits for Dynamic Runtime Assertions in Quantum Computation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "111--114", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2935049", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In this paper, we propose quantum circuits for runtime assertions, which can be used for both software debugging and error detection. Runtime assertion is challenging in quantum computing for two key reasons. First, a quantum bit (qubit) cannot be copied, which is known as the non-cloning theorem. Second, when a qubit is measured, its superposition state collapses into a classical state, losing the inherent parallel information. In this paper, we overcome these challenges with runtime computation through ancilla qubits, which are used to indirectly collect the information of the qubits of interest. 
We design quantum circuits to assert classical states, entanglement, and superposition states.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ancilla qubits; assertions; classical state; Debugging; debugging; dynamic runtime assertions; error detection; inherent parallel information; Logic gates; Measurement uncertainty; noncloning theorem; program debugging; quantum bit; quantum circuits; quantum circuits design; quantum computation; quantum computing; Quantum computing; quantum entanglement; Quantum entanglement; quantum error detection; Qubit; qubit; Runtime; runtime assertion; runtime computation; software debugging; superposition state", } @Article{Rao:2019:ATC, author = "J. Rao and T. Ao and K. Dai and X. Zou", title = "{ARCE}: Towards Code Pointer Integrity on Embedded Processors Using Architecture-Assisted Run-Time Metadata Management", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "115--118", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2935445", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Code Pointer Integrity (CPI) is an efficient control flow protection technique focusing on sensitive code pointers with a formal proof of security, but it relies on software lookup tables or Memory Management Unit (MMU) based address translation and instruction-level memory isolation which are impractical for resource-constrained embedded processors. This paper enables Architecture-assisted Run-time CPI on Embedded Processors (ARCE) with 2-level metadata to balance security, performance and resource overhead. 
The first level 2-bit property metadata colors data into different domains and the second level boundary metadata holds structure constraints for indirect code pointers only. With memory and instruction extensions, metadata shares the address space with program data and is propagated at runtime to maintain a precise set of sensitive code pointers. It lazily validates the content and boundary of sensitive pointers at dereference stage to eliminate false alarms. We implemented ARCE based on a shallow 3-stage pipeline processor Z-scale and validated its security effectiveness with code pointer attack vectors in RIPE. It introduces less than 1 percent performance overhead for benchmarks in C with 7.33 percent logic and 6.25 percent memory overhead. ARCE eliminates address space waste and dependency on advanced hardware which makes CPI practical even for systems with bare metal applications.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ARCE; architecture-assisted run-time CPI on embedded processors; architecture-assisted run-time metadata management; code pointer attack vectors; code pointer integrity; Code pointer integrity; control flow protection technique; data flow analysis; embedded processors; embedded systems; first level 2-bit property metadata colors data; Hardware; indirect code pointers; instruction extensions; instruction set extensions; instruction sets; instruction-level memory isolation; Integrated circuits; level boundary metadata; Memory management; memory management unit based address translation; meta data; Metadata; microprocessor chips; MMU; multi-level metadata; pipeline processing; Program processors; Registers; resource-constrained embedded processors; RIPE; security; Security; security of data; sensitive code pointers; shallow 3-stage pipeline processor Z-scale; software lookup 
tables; storage management; table lookup", } @Article{Bhardwaj:2019:DOC, author = "K. Bhardwaj and M. Havasi and Y. Yao and D. M. Brooks and J. M. H. Lobato and G. Wei", title = "Determining Optimal Coherency Interface for Many-Accelerator {SoCs} Using {Bayesian} Optimization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "119--123", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2910521", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The modern system-on-chip (SoC) of the current exascale computing era is complex. These SoCs not only consist of several general-purpose processing cores but also integrate many specialized hardware accelerators. Three common coherency interfaces are used to integrate the accelerators with the memory hierarchy: non-coherent, coherent with the last-level cache (LLC), and fully-coherent. However, using a single coherence interface for all the accelerators in an SoC can lead to significant overheads: in the non-coherent model, accelerators directly access the main memory, which can have considerable performance penalty; whereas in the LLC-coherent model, the accelerators access the LLC but may suffer from performance bottleneck due to contention between several accelerators; and the fully-coherent model, that relies on private caches, can incur non-trivial power/area overheads. Given the limitations of each of these interfaces, this paper proposes a novel performance-aware hybrid coherency interface, where different accelerators use different coherency models, decided at design time based on the target applications so as to optimize the overall system performance. 
A new Bayesian optimization based framework is also proposed to determine the optimal hybrid coherency interface, i.e., use machine learning to select the best coherency model for each of the accelerators in the SoC in terms of performance. For image processing and classification workloads, the proposed framework determined that a hybrid interface achieves up to 23 percent better performance compared to the other homogeneous interfaces, where all the accelerators use a single coherency model.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bayes methods; Bayesian optimization; Coherence; coherence protocols; Computational modeling; Hardware; hardware accelerators; Optimization; Program processors; Protocols; System-on-chip (SoC)", } @Article{Ansari:2019:CLO, author = "Ali Ansari and Pejman Lotfi-Kamran and Hamid Sarbazi-Azad", title = "Code Layout Optimization for Near-Ideal Instruction Cache", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "124--127", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2924429", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Instruction cache misses are a significant source of performance degradation in server workloads because of their large instruction footprints and complex control flow. Due to the importance of reducing the number of instruction cache misses, there has been a myriad of proposals for hardware instruction prefetchers in the past two decades. While effectual, state-of-the-art hardware instruction prefetchers either impose considerable storage overhead or require significant changes in the frontend of a processor. 
Unlike hardware instruction prefetchers, code-layout optimization techniques profile a program and then reorder the code layout of the program to increase spatial locality, and hence, reduce the number of instruction cache misses. While an active area of research in the 1990s, code-layout optimization techniques have largely been neglected in the past decade. We evaluate the suitability of code-layout optimization techniques for modern server workloads and show that if we combine these techniques with a simple next-line prefetcher, they can significantly reduce the number of instruction cache misses. Moreover, we propose a new code-layout optimization algorithm and show that along with a next-line prefetcher, it offers the same performance improvement as the state-of-the-art hardware instruction prefetcher, but with almost no hardware overhead.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "basic-block reordering; Cache storage; code-layout optimization; Encoding; Instruction cache miss; instruction prefetcher; Instruction sets; Optimization; Prefetching", } @Article{Ranganath:2019:SCC, author = "Kiran Ranganath and AmirAli Abdolrashidi and Shuaiwen Leon Song and Daniel Wong", title = "Speeding up Collective Communications Through Inter-{GPU} Re-Routing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "128--131", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2933842", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In order to address the vast needs of disparate domains, computing engines are becoming more sophisticated and complex. 
A typical high-performance computational engine is composed of several accelerator units, in most cases GPUs, plus one or more CPU controllers. All these components are becoming increasingly interconnected to satisfy bandwidth and latency tolerance demands from modern workloads. Due to these constraints, solutions to efficiently interconnect them or to systematically manage their traffic --- such as PCIe v3, NVLink v1 and v2 on the hardware side, and NVIDIA Collective Communication Library (NCCL) and AMD ROCM layer on the software side --- are becoming more commonplace inside HPC systems and cloud data centers. However, as the number of accelerators increases, workloads (especially machine learning) might not be able to fully exploit the computational substrate due to inefficient use of hardware interconnects. Such scenarios can lead to performance bottlenecks where high-bandwidth links are not used by the underlying libraries and under-performing links are overused. This work proposes Workload Optimization Through Inter-GPU Re-routing (WOTIR), which consists of enhanced NCCL-based collective primitives that aim to boost bandwidth utilization (through more efficient routing) and reduce communication overhead. WOTIR targets GPUs with no direct NVLink communication path (which leads to PCIe communications) and instead re-routes communication through intermediate GPUs to bridge NVLink segments and avoid PCIe communications. Such method allows the maximum possible utilization of the NVLink bandwidth between the GPUs without routing through the PCIe bus. Using this method, we see a reduction of up to 34 percent in execution time for selected machine learning workloads when non-optimal GPU allocations arise.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Collective communication; GPU; Graphics processing units; interconnect; Interference; Machine learning; Routing; Servers; Training data", } @Article{Stow:2019:PPM, author = "Dylan Stow and Amin Farmahini-Farahani and Sudhanva Gurumurthi and Michael Ignatowski and Yuan Xie", title = "Power Profiling of Modern Die-Stacked Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "132--135", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2941715", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Die-stacked memories that integrate multiple DRAM dies into the processor package have reduced the interface bottleneck and improved efficiency, but demands for memory capacity and bandwidth remain unfulfilled. Additionally, the introduction of memory into the package further complicates heat removal. Memory power is therefore becoming a key architectural concern. To provide insight into these challenges, an architectural power model for High Bandwidth Memory is developed, validated, and used to provide detailed power profiles. Based on the resulting power trends, power is projected for potential future memory configurations with increased bandwidth and capacity. The results suggest that, without significant improvements in memory technology or architecture, the power utilization of in-package memories will continue to grow and limit the system power budget.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Integrated circuits; Memory management; Power measurement; Power system measurement; Random access memory; random access memory; Three-dimensional displays; three-dimensional integrated circuits", } @Article{Nabavinejad:2019:CDP, author = "Seyed Morteza Nabavinejad and Hassan Hafez-Kolahi and Sherief Reda", title = "Coordinated {DVFS} and Precision Control for Deep Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "136--140", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2942020", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Traditionally, DVFS has been the main mechanism to trade-off performance and power. We observe that Deep Neural Network (DNN) applications offer the possibility to trade-off performance, power, and accuracy using both DVFS and numerical precision levels. Our proposed approach, Power-Inference accuracy Trading (PIT), monitors the server's load, and accordingly adjusts the precision of the DNN model and the DVFS setting of GPU to trade-off the accuracy and power consumption with response time. At high loads and tight request arrivals, PIT leverages INT8-precision instructions of GPU to dynamically change the precision of deployed DNN models and boosts GPU frequency to execute the requests faster at the expense of accuracy reduction and high power consumption. However, when the requests' arrival rate is relaxed and there is slack time for requests, PIT deploys high precision version of models to improve the accuracy and reduces GPU frequency to decrease power consumption. We implement and deploy PIT on a state-of-the-art server equipped with a Tesla P40 GPU. 
Experimental results demonstrate that depending on the load, PIT can improve response time up to 11 percent compared to a job scheduler that uses only FP32 precision. It also improves the energy consumption by up to 28 percent, while achieving around 99.5 percent accuracy of sole FP32-precision.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accuracy; Deep neural network; Graphics processing units; hardware accelerator; Neural networks; power; Power demand; response time; Runtime; Servers; Time factors; Time-frequency analysis", } @Article{Lee:2019:ELM, author = "Seunghak Lee and Nam Sung Kim and Daehoon Kim", title = "Exploiting {OS}-Level Memory Offlining for {DRAM} Power Management", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "141--144", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2942914", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Power and energy consumed by main memory systems in data-center servers have increased as the DRAM capacity and bandwidth increase. Particularly, background power accounts for a considerable fraction of the total DRAM power consumption; the fraction will increase further in the near future, especially when slowing-down technology scaling forces us to provide necessary DRAM capacity through plugging in more DRAM modules or stacking more DRAM chips in a DRAM package. Although current DRAM architecture supports low power states at rank granularity that turn off some components during idle periods, techniques to exploit memory-level parallelism make the rank-granularity power state become ineffective. 
Furthermore, the long wake-up latency is one of obstacles to adopting aggressive power management (PM) with deep power-down states. By tackling the limitations, we propose OffDIMM that is a software-assisted DRAM PM collaborating with the OS-level memory onlining/offlining. OffDIMM maps a memory block in the address space of the OS to a subarray group or groups of DRAM, and sets a deep power-down state for the subarray group when offlining the block. Through the dynamic OS-level memory onlining/offlining based on the current memory usage, our experimental results show OffDIMM reduces background power by 24 percent on average without notable performance overheads.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "DRAM; Energy consumption; Hardware; Linux; Memory management; memory offlining; power management; Power system management; Random access memory", } @Article{Marinakis:2019:PFI, author = "Theodoros Marinakis and Iraklis Anagnostopoulos", title = "Performance and Fairness Improvement on {CMPs} Considering Bandwidth and Cache Utilization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "145--148", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2944810", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Chip multiprocessors (CMPs) have become dominant both in server and embedded domain as they accommodate an increasing amount of cores in order to satisfy the workload demands. However, when applications run concurrently, they compete for shared resources, such as Last Level Cache (LLC) and main memory bandwidth. 
Applications are affected in various ways by contention, and uneven degradation makes the system unreliable and the overall performance unpredictable. The goal of this work is to improve performance by sophisticated grouping that balances bandwidth and LLC requirements, while at the same time providing a fair execution environment by prioritizing applications that experience the least accumulated progress. The proposed scheduler achieves an average performance gain of 16 percent over the Linux scheduler and 6.3 percent over another performance-oriented scheduler. Additionally, it keeps unfairness very close to two fairness-oriented schedulers.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Chip multiprocessors; contention-aware scheduling; Degradation; fairness; Interference; Job shop scheduling; Linux; performance; Quality of service; Resource management", } @Article{Balaji:2019:FEW, author = "Adarsha Balaji and Shihao Song and Anup Das and Nikil Dutt and Jeff Krichmar and Nagarajan Kandasamy and Francky Catthoor", title = "A Framework to Explore Workload-Specific Performance and Lifetime Trade-offs in Neuromorphic Computing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "149--152", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2951507", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Neuromorphic hardware with non-volatile memory (NVM) can implement machine learning workload in an energy-efficient manner. Unfortunately, certain NVMs such as phase change memory (PCM) require high voltages for correct operation. These voltages are supplied from an on-chip charge pump. 
If the charge pump is activated too frequently, its internal CMOS devices do not recover from stress, accelerating their aging and leading to negative bias temperature instability (NBTI) generated defects. Forcefully discharging the stressed charge pump can lower the aging rate of its CMOS devices, but makes the neuromorphic hardware unavailable to perform computations while its charge pump is being discharged. This negatively impacts performance such as latency and accuracy of the machine learning workload being executed. In this letter, we propose a novel framework to exploit workload-specific performance and lifetime trade-offs in neuromorphic computing. Our framework first extracts the precise times at which a charge pump in the hardware is activated to support neural computations within a workload. This timing information is then used with a characterized NBTI reliability model to estimate the charge pump's aging during the workload execution. We use our framework to evaluate workload-specific performance and reliability impacts of using (1) different SNN mapping strategies and (2) different charge pump discharge strategies. We show that our framework can be used by system designers to explore performance and reliability trade-offs early in the design of neuromorphic hardware such that appropriate reliability-oriented design margins can be set.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Aging; and inter-spike interval (ISI); Charge pumps; Negative bias temperature instability; negative bias temperature instability (NBTI); Neuromorphic computing; Neuromorphics; non-volatile memory (NVM); phase-change memory (PCM); spiking neural networks (SNNs); Synapses; Thermal variables control; wear-out", } @Article{Jeon:2019:LAG, author = "Hyeran Jeon and Hodjat Asghari Esfeden and Nael B. 
Abu-Ghazaleh and Daniel Wong and Sindhuja Elango", title = "Locality-Aware {GPU} Register File", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "153--156", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2959298", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "In many emerging applications such as deep learning, large data set is essential to generate reliable solutions. In these big data workloads, memory latency and bandwidth are the main performance bottlenecks. In this article, we propose a locality-aware GPU register file that enables data sharing for memory-intensive big data workloads on GPUs without relying on small on-chip memories. We exploit two types of data sharing patterns commonly found from the big data workloads and have warps opportunistically share data in physical registers instead of issuing memory loads separately and storing the same data redundantly in their registers as well as small shared memory. With an extended register file mapping mechanism, our proposed design enables warps to share data by simply mapping to the same physical registers or reconstructing from the data in the register file already. The proposed sharing not only reduces the memory transactions but also further decreases the register file usage. The spared registers make rooms for applying orthogonal optimizations for energy and performance improvement. Our evaluation on two deep learning workloads and matrixMul show that the proposed locality-aware GPU register file achieves over $ 2 \times $ speedup and saves register space up to 57 percent.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Big Data; convolution neural network; Deep learning; GPU; Graphics processing units; Matrix operations; Registers; System-on-chip", } @Article{Li:2019:PBP, author = "Chen Li and Yifan Sun and Lingling Jin and Lingjie Xu and Zheng Cao and Pengfei Fan and David Kaeli and Sheng Ma and Yang Guo and Jun Yang", title = "Priority-Based {PCIe} Scheduling for Multi-Tenant Multi-{GPU} Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "157--160", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2955119", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Multi-GPU systems are widely used in data centers to provide significant speedups to compute-intensive workloads such as deep neural network training. However, limited PCIe bandwidth between the CPU and multiple GPUs becomes a major performance bottleneck. We observe that relying on a traditional Round-Robin-based PCIe scheduling policy can result in severe bandwidth competition and stall the execution of multiple GPUs. In this article, we propose a priority-based scheduling policy which aims to overlap the data transfers and GPU execution for different applications to alleviate this bandwidth contention. We also propose a dynamic priority policy for semi-QoS management that can help applications to meet QoS requirements and improve overall multi-GPU system throughput. Experimental results show that the system throughput is improved by 7.6 percent on average using our priority-based PCIe scheduling scheme as compared with a Round-Robin-based PCIe scheduler. 
Leveraging semi-QoS management can help to meet defined QoS goals, while preserving application throughput.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Data transfer; Graphics processing units; Multi-GPU; multi-tenant; PCIe scheduling; Quality of service; Switches; Task analysis; Throughput", } @Article{Weng:2019:DMC, author = "Jian Weng and Sihao Liu and Vidushi Dadu and Tony Nowatzki", title = "{DAEGEN}: a Modular Compiler for Exploring Decoupled Spatial Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "161--165", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2955456", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Specialized hardware accelerators, particularly those that are programmable and flexible to target multiple problems in their domain, have proven to provide orders of magnitude speedup and energy efficiency. However, their design requires extensive manual effort, due to the need for hardware-software codesign to balance the degree and forms of specialization to the domains or program behaviors of interest. This article provides the first steps towards one approach for automating much of these processes. The insight behind our work is to recognize that decoupled spatial architectures both define a rich design space with many tradeoffs for different kinds of applications, and also can be composed out of a simple set of well-defined primitives. Therefore, we propose a modular accelerator design framework, DAEGEN, a.k.a. Decoupled Access Excution Accelerator Generator. 
This article defines an initial compiler and architecture primitives, and we discuss key challenges.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Delays; design automation; Hardware; hardware/software co-design; Kernel; Manuals; Micromechanical devices; Reconfigurable accelerators; spatial architectures; Synchronization", } @Article{Iliakis:2019:LIG, author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios Soudris", title = "{LOOG}: Improving {GPU} Efficiency With Light-Weight Out-Of-Order Execution", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "166--169", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2951161", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "GPUs are one of the most prevalent platforms for accelerating general-purpose workloads due to their intuitive programming model, computing capacity, and cost-effectiveness. GPUs rely on massive multi-threading and fast context switching to overlap computations with memory operations. Among the diverse GPU workloads, there exists a class of kernels that fail to maintain a sufficient number of active warps to hide the latency of memory operations, and thus suffer from frequent stalling. We observe that these kernels will benefit from increased levels of Instruction-Level Parallelism and we propose a novel architecture with lightweight Out-Of-Order execution capability. To minimize hardware overheads, we carefully design our extension to highly re-use the existing micro-architectural structures. 
We show that the proposed architecture outperforms traditional platforms by 15 to 46 percent on average for low occupancy kernels, with an area overhead of 0.74 to 3.94 percent. Finally, we prove the potential of our proposal as a GPU u-arch alternative, by providing a 5 percent speedup over a wide collection of 63 general-purpose kernels with as little as 0.74 percent area overhead.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Copper; GPGPU; Graphics processing units; Kernel; micro-architecture; Out of order; Out-of-Order execution; Radio access technologies; Radio frequency; Registers", } @Article{Matsuo:2019:IIF, author = "Reoma Matsuo and Ryota Shioya and Hideki Ando", title = "Improving the Instruction Fetch Throughput with Dynamically Configuring the Fetch Pipeline", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "170--173", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2952592", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Instruction cache misses are the critical performance bottleneck in the execution of recent workloads such as Web applications written in JavaScript and server applications. Although various instruction prefetchers have been proposed to reduce the misses, the requirements for both high miss coverage and small hardware cost are not satisfied. In this article, we propose a novel method that improves the instruction fetch throughput not by instruction prefetching but by dynamically configuring the fetch pipeline structure. 
Our scheme switches between the normal pipeline and newly introduced miss-assuming pipeline, which does not degrade the fetch throughput even when L1 instruction cache misses occur. Our method achieves high instruction fetch throughput with simple hardware and small cost unlike previously proposed prefetchers. Our evaluation results using Web and database workloads show that our method improves the performance by 16.6 percent and 8.6 percent on average, compared to that with noprefetching and the state-of-the-art instruction prefetcher, PIF, respectively, and achieves as much as 79.0 percent of the performance of the processor with a perfect instruction cache.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Cache storage; Instruction fetch; pipeline implementation; Pipelines; Prefetching; Servers; Throughput", } @Article{Kommareddy:2019:CMS, author = "Vamsee Reddy Kommareddy and Baogang Zhang and Fan Yao and Rickard Ewetz and Amro Awad", title = "Are Crossbar Memories Secure? {New} Security Vulnerabilities in Crossbar Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "174--177", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2952111", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Memristors are emerging Non-Volatile Memories (NVMs) that are promising for building future memory systems. Unlike DRAM, memristors are non-volatile, i.e., they can retain data after power loss. In contrast to DRAM where each cell is associated with a pass transistor, memristor cells can be implemented without such transistor, and hence enable high density ReRAM systems. 
Moreover, memristors leverage a unique crossbar architecture to improve the density of memory modules. Memristors have been considered to build future data centers with both energy-efficiency and high memory capacity goals. Surprisingly, we observe that using memristors in multi-tenant environments, e.g., cloud systems, entails new security vulnerabilities. In particular, the crossbar contents can severely affect the write latency of any data cells within the same crossbar. With various memory interleaving options (to optimize performance), a single crossbar might be shared among several applications/users from different security domains. Therefore, such content-dependent latency can open new source of information leakage. In this article, we describe the information leakage problem in memristor crossbar arrays (MCAs), discuss how they can be potentially exploited from application level. Our work highlights the need for future research to mitigate (and potentially eliminate) information leakage in crossbar memories in future computing systems.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Crossbar memory; Memristors; Microprocessors; Nonvolatile memory; Random access memory; ReRAM; Security; security", } @Article{Barber:2019:ISD, author = "Kristin Barber and Anys Bacha and Li Zhou and Yinqian Zhang and Radu Teodorescu", title = "Isolating Speculative Data to Prevent Transient Execution Attacks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "178--181", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2916328", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Hardware security has recently re-surfaced as a first-order concern to the confidentiality protections of computing systems. Meltdown and Spectre introduced a new class of exploits that leverage transient state as an attack surface and have revealed fundamental security vulnerabilities of speculative execution in high-performance processors. These attacks derive benefit from the fact that programs may speculatively execute instructions outside their legal control flows. This insight is then utilized for gaining access to restricted data and exfiltrating it by means of a covert channel. This study presents a microarchitectural mitigation technique for shielding transient state from covert channels during speculative execution. Unlike prior work that has focused on closing individual covert channels used to leak sensitive information, this approach prevents the use of speculative data by downstream instructions until doing so is determined to be safe. This prevents transient execution attacks at a cost of 18 percent average performance degradation.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "covert timing channels; Delays; Hardware security; Law; Pipelines; Registers; Security; Transient analysis; transient execution attacks", } @Article{Kang:2020:NPP, author = "Ki-Dong Kang and Gyeongseo Park and Nam Sung Kim and Daehoon Kim", title = "Network Packet Processing Mode-Aware Power Management for Data Center Servers", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2926079", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Cavus:2020:EPP, author = "Mustafa Cavus and Mohammed Shatnawi and Resit Sendag and Augustus K. Uht", title = "Exploring Prefetching, Pre-Execution and Branch Outcome Streaming for In-Memory Database Lookups", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2959982", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Bodduna:2020:BRS, author = "Rahul Bodduna and Vinod Ganesan and Patanjali SLPSK and Kamakoti Veezhinathan and Chester Rebeiro", title = "{Brutus}: Refuting the Security Claims of the Cache Timing Randomization Countermeasure Proposed in {CEASER}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2964212", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kim:2020:TSA, author = "Minsub Kim and Jaeha Kung and Sungjin Lee", title = "Towards Scalable Analytics with Inference-Enabled Solid-State Drives", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "13--17", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2019.2930590", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Li:2020:CDE, author = "Congmiao Li and Jean-Luc Gaudiot", title = "Challenges in Detecting an Evasive Spectre", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "18--21", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2976069", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Yan:2020:CUG, author = "Mingyu Yan and Zhaodong Chen and Lei Deng and Xiaochun Ye and Zhimin Zhang and Dongrui Fan and Yuan Xie", title = "Characterizing and Understanding {GCNs} on {GPU}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "22--25", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2970395", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kumar:2020:PSM, author = "Chanchal Kumar and Aayush Chaudhary and Shubham Bhawalkar and Utkarsh Mathur and Saransh Jain and Adith Vastrad and Eric Rotenberg", title = "Post-Silicon Microarchitecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "26--29", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2978841", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Eyerman:2020:BOB, author = "Stijn Eyerman and Wim Heirman and Sam Van den Steen and Ibrahim Hur", title = "Breaking In-Order Branch Miss Recovery", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "30--33", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2980277", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Liu:2020:STA, author = "Zhi-Gang Liu and Paul N. 
Whatmough and Matthew Mattina", title = "Systolic Tensor Array: an Efficient Structured-Sparse {GEMM} Accelerator for Mobile {CNN} Inference", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "34--37", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2979965", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Krishnan:2020:SLV, author = "Srivatsan Krishnan and Zishen Wan and Kshitij Bhardwaj and Paul Whatmough and Aleksandra Faust and Gu-Yeon Wei and David Brooks and Vijay Janapa Reddi", title = "The Sky Is Not the Limit: a Visual Performance Model for Cyber-Physical Co-Design in Autonomous Machines", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "38--42", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2981022", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Michaud:2020:ETT, author = "Pierre Michaud", title = "Exploiting Thermal Transients With Deterministic Turbo Clock Frequency", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "43--46", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2983920", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Chu:2020:HPD, author = "Zhufei Chu and Huiming Tian and Zeqiang Li and Yinshui Xia and Lunyao Wang", title = "A High-Performance Design of Generalized Pipeline Cellular Array", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "47--50", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2986197", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zhu:2020:HIR, author = "Lingjun Zhu and Lennart Bamberg and Anthony Agnesina and Francky Catthoor and Dragomir Milojevic and Manu Komalan and Julien Ryckaert and Alberto Garcia-Ortiz and Sung Kyu Lim", title = "Heterogeneous {$3$D} Integration for a {RISC-V} System With {STT-MRAM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "51--54", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2992644", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Mason:2020:UPI, author = "Tony Mason and Thaleia Dimitra Doudali and Margo Seltzer and Ada Gavrilovska", title = "Unexpected Performance of {Intel Optane DC} Persistent Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "55--58", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2987303", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zhang:2020:AIG, author = "Zhihui Zhang and Jingwen Leng and Lingxiao Ma and Youshan Miao and Chao Li and Minyi Guo", title = "Architectural Implications of Graph Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "59--62", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2988991", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Sartor:2020:HHL, author = "Anderson L. Sartor and Anish Krishnakumar and Samet E. Arda and Umit Y. Ogras and Radu Marculescu", title = "{HiLITE}: Hierarchical and Lightweight Imitation Learning for Power Management of Embedded {SoCs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "63--67", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2992182", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Desai:2020:PAH, author = "Harsh Desai and Brandon Lucia", title = "A Power-Aware Heterogeneous Architecture Scaling Model for Energy-Harvesting Computers", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "68--71", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2989440", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lai:2020:TDB, author = "Bo-Cheng Lai and Chun-Yen Chen and Yi-Da Hsin and Bo-Yen Lin", title = "A Two-Directional {BigData} Sorting Architecture on {FPGAs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "72--75", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2993040", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Gu:2020:NTC, author = "Peng Gu and Benjamin S. Lim and Wenqin Huangfu and Krishan T. 
Malladi and Andrew Chang and Yuan Xie", title = "{NMTSim}: Transaction-Command Based Simulator for New Memory Technology Devices", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "76--79", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2995167", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Rezaei:2020:NNM, author = "Seyyed Hossein SeyyedAghaei Rezaei and Mehdi Modarressi and Rachata Ausavarungnirun and Mohammad Sadrosadati and Onur Mutlu and Masoud Daneshtalab", title = "{NoM}: Network-on-Memory for Inter-Bank Data Transfer in Highly-Banked Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "80--83", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2990599", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2020:IIC, author = "Anonymous", title = "2019 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 
18", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "1", pages = "1--8", month = jan # "\slash " # jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2964168", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Ros:2020:EIP, author = "Alberto Ros and Alexandra Jimborean", title = "The Entangling Instruction Prefetcher", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "84--87", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3002947", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Singh:2020:VLB, author = "Rahul Singh and Gokul Subramanian Ravi and Mikko Lipasti and Joshua San Miguel", title = "Value Locality Based Approximation With {ODIN}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "88--91", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3002542", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zhang:2020:FRP, author = "Jie Zhang and Miryeong Kwon and Sanghyun Han and Nam Sung Kim and Mahmut Kandemir and Myoungsoo Jung", title = "{FastDrain}: Removing Page Victimization Overheads in {NVMe} Storage Stack", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "92--96", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3005507", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Im:2020:PBA, author = "Junsu Im and Hanbyeol Kim and Yumin Won and Jiho Oh and Minjae Kim and Sungjin Lee", title = "Probability-Based Address Translation for Flash {SSDs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "97--100", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3006529", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Samara:2020:CDS, author = "Ahmed Samara and James Tuck", title = "The Case for Domain-Specialized Branch Predictors for Graph-Processing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "101--104", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3005895", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Mirosanlou:2020:MED, author = "Reza Mirosanlou and Danlu Guo and Mohamed Hassan and Rodolfo Pellizzoni", title = "{MCsim}: an Extensible {DRAM} Memory Controller Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "105--109", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3008288", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Li:2020:DCA, author = "Shang Li and Zhiyuan Yang and Dhiraj Reddy and Ankur Srivastava and Bruce Jacob", title = "{DRAMsim3}: a Cycle-Accurate, Thermal-Capable {DRAM} Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "106--109", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.2973991", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lee:2020:SFA, author = "Joo Hwan Lee and Hui Zhang and Veronica Lagrange and Praveen Krishnamoorthy and Xiaodong Zhao and Yang Seok Ki", title = "{SmartSSD}: {FPGA} Accelerated Near-Storage Data Analytics on {SSD}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "110--113", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3009347", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Sutradhar:2020:PPP, author = "Purab Ranjan Sutradhar and Mark Connolly and Sathwika Bavikadi and Sai Manoj Pudukotai Dinakarrao and Mark A. 
Indovina and Amlan Ganguly", title = "{pPIM}: a Programmable Processor-in-Memory Architecture With Precision-Scaling for Deep Learning", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "118--121", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3011643", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Choe:2020:SMP, author = "Wonkyo Choe and Jonghyeon Kim and Jeongseob Ahn", title = "A Study of Memory Placement on Hardware-Assisted Tiered Memory Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "122--125", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3015613", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lachtar:2020:CSA, author = "Nada Lachtar and Abdulrahman Abu Elkhail and Anys Bacha and Hafiz Malik", title = "A Cross-Stack Approach Towards Defending Against Cryptojacking", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "126--129", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3017457", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Golshan:2020:HPC, author = "Fatemeh Golshan and Mohammad Bakhshalipour and Mehran Shakerinava and Ali Ansari and Pejman Lotfi-Kamran and Hamid Sarbazi-Azad", title = "Harnessing Pairwise-Correlating Data Prefetching With Runahead Metadata", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "130--133", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3019343", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lazarev:2020:DTE, author = "Nikita Lazarev and Neil Adit and Shaojie Xiang and Zhiru Zhang and Christina Delimitrou", title = "{Dagger}: Towards Efficient {RPCs} in Cloud Microservices With Near-Memory Reconfigurable {NICs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "134--138", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3020064", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jahanshahi:2020:GNC, author = "Ali Jahanshahi and Hadi Zamani Sabzi and Chester Lau and Daniel Wong", title = "{GPU-NEST}: Characterizing Energy Efficiency of Multi-{GPU} Inference Servers", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "139--142", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3023723", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Mikhailenko:2020:ASA, author = "Darya Mikhailenko and Yujin Nakamoto and Ben Feinberg and Engin Ipek", title = "Adapting In Situ Accelerators for Sparsity with Granular Matrix Reordering", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "143--146", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3031907", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Ishii:2020:RIP, author = "Yasuo Ishii and Jaekyu Lee and Krishnendra Nathella and Dam Sunwoo", title = "Rebasing Instruction Prefetching: an Industry Perspective", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "147--150", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3035068", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Newton:2020:PGP, author = "Newton and Virendra Singh and Trevor E. 
Carlson", title = "{PIM-GraphSCC}: {PIM}-Based Graph Processing Using Graph's Community Structures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "151--154", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3039498", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Chowdhury:2020:VNM, author = "Zamshed I. Chowdhury and S. Karen Khatamifard and Zhaoyong Zheng and Tali Moreshet and R. Iris Bahar and Ulya R. Karpuzcu", title = "Voltage Noise Mitigation With Barrier Approximation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "155--158", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3040088", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Che:2020:LMA, author = "Yuezhi Che and Yuanzhou Yang and Amro Awad and Rujia Wang", title = "A Lightweight Memory Access Pattern Obfuscation Framework for {NVM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "163--166", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3041484", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Sadredini:2020:ESP, author = "Elaheh Sadredini and Reza Rahimi and Kevin Skadron", title = "Enabling In-{SRAM} Pattern Processing With Low-Overhead Reporting Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "167--170", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3042194", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Sharifi:2020:AAC, author = "Ferdous Sharifi and Nezam Rohbani and Shaahin Hessabi", title = "Aging-Aware Context Switching in Multicore Processors Based on Workload Classification", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "19", number = "2", pages = "159--162", month = jul # "\slash " # dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3040326", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2021:IIC, author = "Anonymous", title = "2020 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 19", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "1--7", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3048555", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kwon:2021:FQM, author = "Hyoukjun Kwon and Michael Pellauer and Angshuman Parashar and Tushar Krishna", title = "{Flexion}: a Quantitative Metric for Flexibility in {DNN} Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3044607", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kim:2021:TTR, author = "Byeongho Kim and Jaehyun Park and Eojin Lee and Minsoo Rhu and Jung Ho Ahn", title = "{TRiM}: Tensor Reduction in Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3042805", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Boran:2021:FGS, author = "Nirmal Kumar Boran and Shubhankit Rathore and Meet Udeshi and Virendra Singh", title = "Fine-Grained Scheduling in Heterogeneous-{ISA} Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3045056", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Resch:2021:DLQ, author = "Salonik Resch and Swamit Tannu and Ulya R. Karpuzcu and Moinuddin Qureshi", title = "A Day In the Life of a Quantum Error", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3045628", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Shan:2021:ACP, author = "Mohsin Shan and Omer Khan", title = "Accelerating Concurrent Priority Scheduling Using Adaptive in-Hardware Task Distribution in Multicores", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "17--21", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3045670", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Perais:2021:CSS, author = "Arthur Perais", title = "A Case for Speculative Strength Reduction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "22--25", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3048694", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Navarro:2021:HSS, author = "Marta Navarro and Lucia Pons and Julio Sahuquillo", title = "{Hy-Sched}: a Simple Hyperthreading-Aware Thread to Core Allocation Strategy", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "26--29", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3051393", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Alian:2021:IOI, author = "Mohammad Alian and Jongmin Shin and Ki-Dong Kang and Ren Wang and Alexandros Daglis and Daehoon Kim and Nam Sung Kim", title = "{IDIO}: Orchestrating Inbound Network Data on Server Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "30--33", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2020.3044923", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kim:2021:RSD, author = "Hweesoo Kim and Sunjung Lee and Jaewan Choi and Jung Ho Ahn", title = "Row-Streaming Dataflow Using a Chaining Buffer and Systolic Array+ Structure", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "34--37", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3054371", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kasan:2021:CDB, author = "Hans Kasan and John Kim", title = "The Case for Dynamic Bias in Global Adaptive Routing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "38--41", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3061408", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Shah:2021:TDS, author = "Parth Shah and Ranjal Gautham Shenoy and Vaidyanathan Srinivasan and Pradip Bose and Alper Buyuktosunoglu", title = "{TokenSmart}: Distributed, Scalable Power Management in the Many-Core Era", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "42--45", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3064441", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Li:2021:RRA, author = "Qian Li and Bin Li and Pietro Mercati and Ramesh Illikkal and Charlie Tai and Michael Kishinevsky and Christos Kozyrakis", title = "{RAMBO}: Resource Allocation for Microservices Using {Bayesian} Optimization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "46--49", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3066142", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kim:2021:ZCS, author = "Sunghwan Kim and Gyusun Lee and Jiwon Woo and Jinkyu Jeong", title = "Zero-Copying {I/O} Stack for Low-Latency {SSDs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "50--53", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3064876", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Yu:2021:MDC, author = "Chao Yu and Sihang Liu and Samira Khan", title = "{MultiPIM}: a Detailed and Configurable Multi-Stack Processing-In-Memory Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "54--57", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3061905", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Tan:2021:FQF, author = "Tian Tan and Eriko Nurvitadhi and Aravind Dasu and Martin Langhammer and Derek Chiou", title = "{FlexScore}: Quantifying Flexibility", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "58--61", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3076413", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jul 8 12:08:28 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Sarkar:2021:DDA, author = "Arindam Sarkar and Newton Singh and Varun Venkitaraman and Virendra Singh", title = "{DAM}: Deadblock Aware Migration Techniques for {STT-RAM}-Based Hybrid Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "62--65", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3071717", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jul 8 12:08:28 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Li:2021:HAG, author = "Han Li and Mingyu Yan and Xiaocheng Yang and Lei Deng and Wenming Li and Xiaochun Ye and Dongrui Fan and Yuan Xie", title = "Hardware Acceleration for {GCNs} via Bidirectional Fusion", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "66--69", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3077956", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jul 8 12:08:28 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jang:2021:DPT, author = "Yongjoo Jang and Sejin Kim and Daehoon Kim and Sungjin Lee and Jaeha Kung", title = "Deep Partitioned Training From Near-Storage Computing to {DNN} Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "70--73", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3081752", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jul 8 12:08:28 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Resch:2021:CPC, author = "Salonik Resch and Husrev Cilasun and Ulya R. 
Karpuzcu", title = "Cryogenic {PIM}: Challenges \& Opportunities", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "74--77", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3077536", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jul 8 12:08:28 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Heirman:2021:RRC, author = "Wim Heirman and Stijn Eyerman and Kristof {Du Bois} and Ibrahim Hur", title = "{RIO}: {ROB}-Centric In-Order Modeling of Out-of-Order Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "1", pages = "78--81", month = jan # "\slash " # jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3084365", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jul 8 12:08:28 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Amarnath:2021:HAS, author = "Aporva Amarnath and Subhankar Pal and Hiwot Tadese Kassa and Augusto Vega and Alper Buyuktosunoglu and Hubertus Franke and John-David Wellman and Ronald Dreslinski and Pradip Bose", title = "Heterogeneity-Aware Scheduling on {SoCs} for Autonomous Vehicles", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "82--85", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3085505", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Aug 10 15:14:44 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Wang:2021:WWP, author = "Lei Wang and Xingwang Xiong and Jianfeng Zhan and Wanling Gao and Xu Wen and Guoxin Kang and Fei Tang", title = "{WPC}: Whole-Picture Workload Characterization Across Intermediate Representation, {ISA}, and Microarchitecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "86--89", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3087828", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Aug 10 15:14:44 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Eyerman:2021:MDT, author = "Stijn Eyerman and Wim Heirman and Ibrahim Hur", title = "Modeling {DRAM} Timing in Parallel Simulators With Immediate-Response Memory Model", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "90--93", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3093075", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Aug 10 15:14:44 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Falahati:2021:DAC, author = "Hajar Falahati and Masoud Peyro and Hossein Amini and Mehran Taghian and Mohammad Sadrosadati and Pejman Lotfi-Kamran and Hamid Sarbazi-Azad", title = "Data-Aware Compression of Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "94--97", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3096191", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Aug 10 15:14:44 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Wu:2021:GOD, author = "Benjamin Wu and Trishita Tiwari and G. Edward Suh and Aaron B. 
Wagner", title = "Guessing Outputs of Dynamically Pruned {CNNs} Using Memory Access Patterns", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "98--101", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3101505", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Yoo:2021:MBU, author = "Mingi Yoo and Jaeyong Song and Jounghoo Lee and Namhyung Kim and Youngsok Kim and Jinho Lee", title = "Making a Better Use of Caches for {GCN} Accelerators with Feature Slicing and Automatic Tile Morphing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "102--105", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3090954", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Hyun:2021:CAD, author = "Bongjoon Hyun and Jiwon Lee and Minsoo Rhu", title = "Characterization and Analysis of Deep Learning for {3D} Point Cloud Analytics", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "106--109", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3099117", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Rucker:2021:CTB, author = "Alexander Rucker and Muhammad Shahbaz and Kunle Olukotun", title = "Chopping off the Tail: Bounded Non-Determinism for Real-Time Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "110--113", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3102224", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Su:2021:EPA, author = "Jiya Su and Linfeng He and Peng Jiang and Rujia Wang", title = "Exploring {PIM} Architecture for High-Performance Graph Pattern Mining", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "114--117", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3103665", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lee:2021:UIN, author = "Yunjae Lee and Youngeun Kwon and Minsoo Rhu", title = "Understanding the Implication of Non-Volatile Memory for Large-Scale Graph Neural Network Training", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "118--121", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3098943", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Munoz-Martinez:2021:SEC, author = "Francisco Mu{\~n}oz-Mart{\'\i}nez and Jos{\'e} L. Abell{\'a}n and Manuel E. 
Acacio and Tushar Krishna", title = "{STONNE}: Enabling Cycle-Level Microarchitectural Simulation for {DNN} Inference Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "122--125", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3097253", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Shoghi:2021:SSQ, author = "Nima Shoghi and Andrei Bersatti and Moinuddin Qureshi and Hyesoon Kim", title = "{SmaQ}: Smart Quantization for {DNN} Training by Exploiting Value Clustering", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "126--129", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3108505", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Volos:2021:CRA, author = "Haris Volos", title = "The Case for Replication-Aware Memory-Error Protection in Disaggregated Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "130--133", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3110439", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Asheim:2021:BXS, author = "Truls Asheim and Boris Grot and Rakesh Kumar", title = "{BTB-X}: a Storage-Effective {BTB} Organization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "134--137", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3109945", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kumar:2021:DDS, author = "Pratik Kumar and Chavhan Sujeet Yashavant and Biswabandan Panda", title = "{DAMARU}: a Denial-of-Service Attack on Randomized Last-Level Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "138--141", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3112180", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Ghasemi:2021:MPE, author = "Fatemeh Ghasemi and Magnus Jahre", title = "Modeling Periodic Energy-Harvesting Computing Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "142--145", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3117031", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kalani:2021:ICB, author = "Neelu Shivprakash Kalani and Biswabandan Panda", title = "Instruction Criticality Based Energy-Efficient Hardware Data Prefetching", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "146--149", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3117005", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kim:2021:DSR, author = "Jiho Kim and Myoungsoo Jung and John Kim", title = "Decoupled {SSD}: Reducing Data Movement on {NAND}-Based Flash {SSD}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "150--153", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3118688", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lee:2021:LPM, author = "Hyeon Gyu Lee and Minwook Kim and Juwon Lee and Eunji Lee and Bryan S. 
Kim and Sungjin Lee and Yeseong Kim and Sang Lyul Min and Jin-Soo Kim", title = "Learned Performance Model for {SSD}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "154--157", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3120728", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Gurumurthi:2021:HRE, author = "Sudhanva Gurumurthi and Kijun Lee and Munseon Jang and Vilas Sridharan and Aaron Nygren and Yesin Ryu and Kyomin Sohn and Taekyun Kim and Hoeju Chung", title = "{HBM3 RAS}: Enhancing Resilience at Scale", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "158--161", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3117150", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Aimoniotis:2021:RBC, author = "Pavlos Aimoniotis and Christos Sakalis and Magnus Sj{\"a}lander and Stefanos Kaxiras", title = "Reorder Buffer Contention: a Forward Speculative Interference Attack for Speculation Invariant Instructions", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "162--165", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3123408", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Nabavinejad:2021:BLB, author = "Seyed Morteza Nabavinejad and Sherief Reda", title = "{BayesTuner}: Leveraging {Bayesian} Optimization for {DNN} Inference Configuration Selection", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "166--170", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3123695", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Ham:2021:NDP, author = "Hyungkyu Ham and Hyunuk Cho and Minjae Kim and Jueon Park and Jeongmin Hong and Hyojin Sung and Eunhyeok Park and Euicheol Lim and Gwangsun Kim", title = "Near-Data Processing in Memory Expander for {DNN} Acceleration on {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "171--174", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3126450", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Liu:2021:SMS, author = "Wenjie Liu and Wim Heirman and Stijn Eyerman and Shoaib Akram and Lieven Eeckhout", title = "Scale-Model Simulation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "20", number = "2", pages = "175--178", month = jul # "\slash " # dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3133112", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Anonymous:2022:IIC, author = "Anonymous", title = "2021 Index {{\booktitle{IEEE Computer Architecture Letters}}} Vol. 
20", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "1", pages = "1--8", month = jan # "\slash " # jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3141948", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Xie:2022:MSS, author = "Xinfeng Xie and Peng Gu and Jiayi Huang and Yufei Ding and Yuan Xie", title = "{MPU-Sim}: a Simulator for In-{DRAM} Near-Bank Processing Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2021.3135557", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zou:2022:AGP, author = "Mo Zou and Mingzhe Zhang and Rujia Wang and Xian-He Sun and Xiaochun Ye and Dongrui Fan and Zhimin Tang", title = "Accelerating Graph Processing With Lightweight Learning-Based Data Reordering", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3151087", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Barber:2022:PSA, author = "Kristin Barber and Moein Ghaniyoun and Yinqian Zhang and Radu Teodorescu", title = "A Pre-Silicon Approach to Discovering Microarchitectural Vulnerabilities in Security Critical Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3151256", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lee:2022:MES, author = "Dusol Lee and Duwon Hong and Wonil Choi and Jihong Kim", title = "{MQSim-E}: an Enterprise {SSD} Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3144773", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lucas:2022:LHI, author = "Benjamin J. Lucas and Ali Alwan and Marion Murzello and Yazheng Tu and Pengzhou He and Andrew J. Schwartz and David Guevara and Ujjwal Guin and Kyle Juretus and Jiafeng Xie", title = "Lightweight Hardware Implementation of Binary Ring-{LWE} {PQC} Accelerator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3160394", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Apr 14 17:00:32 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Shin:2022:RSA, author = "Yongwon Shin and Juseong Park and Jeongmin Hong and Hyojin Sung", title = "Runtime Support for Accelerating {CNN} Models on Digital {DRAM} Processing-in-Memory Hardware", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "33--36", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3182363", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jin:2022:MPC, author = "Hoyong Jin and Donghun Jeong and Taewon Park and Jong Hwan Ko and Jungrae Kim", title = "Multi-Prediction Compression: an Efficient and Scalable Memory Compression Framework for {GP-GPU}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "37--40", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3177419", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kokkinis:2022:DOC, author = "Argyris Kokkinis and Dionysios Diamantopoulos and Kostas Siozios", title = "Dynamic Optimization of On-Chip Memories for {HLS} Targeting Many-Accelerator Platforms", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "41--44", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3190048", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Yun:2022:GND, author = "Sungmin Yun and Byeongho Kim and Jaehyun Park and Hwayong Nam and Jung Ho Ahn and Eojin Lee", title = "{GraNDe}: Near-Data Processing Architecture With Adaptive Matrix Mapping for Graph Convolutional Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "45--48", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3182387", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Ma:2022:FBA, author = "Rui Ma and Evangelos Georganas and Alexander Heinecke and Sergey Gribok and Andrew Boutros and Eriko Nurvitadhi", title = "{FPGA-Based} {AI} Smart {NICs} for Scalable Distributed {AI} Training Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "49--52", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3189207", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Hameed:2022:DPA, author = "Fazal Hameed and Asif Ali Khan and Sebastien Ollivier and Alex K. Jones and Jeronimo Castrillon", title = "{DNA} Pre-Alignment Filter Using Processing Near Racetrack Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "53--56", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3194263", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Yang:2022:SEP, author = "Ling Yang and Libo Huang and Run Yan and Nong Xiao and Sheng Ma and Li Shen and Weixia Xu", title = "Stride Equality Prediction for Value Speculation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "57--60", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3195411", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Hong:2022:OMC, author = "Jeongmin Hong and Sungjun Cho and Gwangsun Kim", title = "Overcoming Memory Capacity Wall of {GPUs} With Heterogeneous Memory Stack", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "61--64", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3196932", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Piccolboni:2022:ASS, author = "Luca Piccolboni and Davide Giri and Luca P. 
Carloni", title = "Accelerators \& Security: The Socket Approach", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "65--68", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3179947", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Yan:2022:CUH, author = "Mingyu Yan and Mo Zou and Xiaocheng Yang and Wenming Li and Xiaochun Ye and Dongrui Fan and Yuan Xie", title = "Characterizing and Understanding {HGNNs} on {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "69--72", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3198281", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Accetti:2022:SCE, author = "Cecil Accetti and Rendong Ying and Peilin Liu", title = "Structured Combinators for Efficient Graph Reduction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "73--76", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3198844", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Omori:2022:OSH, author = "Yu Omori and Keiji Kimura", title = "Open-Source Hardware Memory Protection Engine Integrated With {NVMM} Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "77--80", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3197777", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kim:2022:CSD, author = "Minjae Kim and Bryan S. Kim and Eunji Lee and Sungjin Lee", title = "A Case Study of a {DRAM-NVM} Hybrid Memory Allocator for Key--Value Stores", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "81--84", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3197654", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Wang:2022:ISE, author = "Zhengrong Wang and Christopher Liu and Tony Nowatzki", title = "{Infinity Stream}: Enabling Transparent and Automated In-Memory Computing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "85--88", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3203064", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Wu:2022:DCG, author = "Lingxi Wu and Rasool Sharifi and Ashish Venkat and Kevin Skadron", title = "{DRAM-CAM}: General-Purpose Bit-Serial Exact Pattern Matching", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "89--92", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3201168", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Resch:2022:VSQ, author = "Salonik Resch and Ulya Karpuzcu", title = "On Variable Strength Quantum {ECC}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "93--96", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3200204", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Salvesen:2022:LAR, author = "Peter Salvesen and Magnus Jahre", title = "{LMT}: Accurate and Resource-Scalable Slowdown Prediction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "97--100", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3203483", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Shin:2022:OOS, author = "Gyeongcheol Shin and Junsoo Kim and Joo-Young Kim", title = "{OpenMDS}: an Open-Source Shell Generation Framework for High-Performance Design on {Xilinx} Multi-Die {FPGAs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "101--104", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3202016", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jalili:2022:MPD, author = "Majid Jalili and Mattan Erez", title = "Managing Prefetchers With Deep Reinforcement Learning", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "105--108", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3210397", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lenjani:2022:PAH, author = "Marzieh Lenjani and Alif Ahmed and Kevin Skadron", title = "{Pulley}: an Algorithm\slash Hardware Co-Optimization for In-Memory Sorting", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "109--112", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3208255", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Sorting is an important kernel that requires many passes on data, where each pass imposes significant data movement overhead. Processing in memory (PIM) can reduce this data movement overhead while providing high parallelism. The radix sorting algorithm is scalable and can exploit PIM's parallelism. However, this algorithm is inefficient for current PIM-based accelerators for three reasons: (i) requiring a large intermediate array per processing unit, wasting capacity, (ii) requiring a prefix-sum operation across all the large intermediate arrays, imposing performance overhead, and (iii) requiring significant random accesses, which are costly in PIM. In this paper, we propose an algorithm and hardware co-optimization for sorting that enable every group of processing elements to cooperatively share and generate an intermediate array, reducing the capacity overhead of intermediate arrays and performance overhead of the prefix-sum operation. To prevent the shared array from becoming a bottleneck due to random accesses, we eliminate random accesses by adding a local sorting step to the radix sorting and providing efficient hardware support for this step. 
On average, our hardware/algorithm optimizations, Pulley, deliver 20$ \times $ speedup compared to Bonsai, an FPGA-based sorting accelerator, and 13$ \times $ speedup compared to IMC, an in-logic-layer-based sorting accelerator.", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zhu:2022:RBP, author = "Yongye Zhu and Shijia Wei and Mohit Tiwari", title = "Revisiting Browser Performance Benchmarking From an Architectural Perspective", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "113--116", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3210483", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Gouk:2022:PHA, author = "Donghyun Gouk and Seungkwan Kang and Miryeong Kwon and Junhyeok Jang and Hyunkyu Choi and Sangwon Lee and Myoungsoo Jung", title = "{PreGNN}: Hardware Acceleration to Take Preprocessing Off the Critical Path in Graph Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "117--120", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3193256", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Wang:2022:CIR, author = "Yinshen Wang and Wenming Li and Tianyu Liu and Liangjiang Zhou and Bingnan Wang and Zhihua Fan and Xiaochun Ye and Dongrui Fan and Chibiao Ding", title = "Characterization and Implementation of Radar System Applications on a Reconfigurable Dataflow Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "121--124", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3215595", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Hou:2022:CUE, author = "Xiaofeng Hou and Cheng Xu and Jiacheng Liu and Xuehan Tang and Lingyu Sun and Chao Li and Kwang-Ting Cheng", title = "Characterizing and Understanding End-to-End Multi-Modal Neural Networks on {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "125--128", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3215718", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Nye:2022:SSS, author = "Jared Nye and Omer Khan", title = "{SSE}: Security Service Engines to Accelerate Enclave Performance in Secure Multicore Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "129--132", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3210149", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Chacon:2022:HTT, author = "Gino A. Chacon and Charles Williams and Johann Knechtel and Ozgur Sinanoglu and Paul V. Gratz", title = "Hardware {Trojan} Threats to Cache Coherence in Modern {2.5D} Chiplet Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "133--136", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3216820", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Eeckhout:2022:FOM, author = "Lieven Eeckhout", title = "A First-Order Model to Assess Computer Architecture Sustainability", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "137--140", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3217366", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zhou:2022:LPL, author = "Ranyang Zhou and Sepehr Tabrizchi and Arman Roohi and Shaahin Angizi", title = "{LT-PIM}: an {LUT-Based} {Processing-in-DRAM} Architecture With {RowHammer} Self-Tracking", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "141--144", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3220084", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Park:2022:SML, author = "Jongwon Park and Jinkyu Jeong", title = "Speculative Multi-Level Access in {LSM} Tree-Based {KV} Store", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "145--148", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3219808", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Fariborz:2022:MSB, author = "Marjan Fariborz and Mahyar Samani and Terry O'Neill and Jason Lowe-Power and S. J. Ben Yoo and Venkatesh Akella", title = "A Model for Scalable and Balanced Accelerators for Graph Processing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "149--152", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3215489", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Huang:2022:EDC, author = "Jianming Huang and Yu Hua", title = "Ensuring Data Confidentiality in {eADR-Based} {NVM} Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "153--156", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3225949", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Kim:2022:SSE, author = "Sejin Kim and Jungwoo Kim and Yongjoo Jang and Jaeha Kung and Sungjin Lee", title = "{SEMS}: Scalable Embedding Memory System for Accelerating Embedding-Based {DNNs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "21", number = "2", pages = "157--160", month = jul # "\slash " # dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3227560", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jimenez:2023:LLC, author = "Daniel A. Jim{\'e}nez and Elvira Teran and Paul V. 
Gratz", title = "Last-Level Cache Insertion and Promotion Policy in the Presence of Aggressive Prefetching", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3242178", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Moon:2023:AAD, author = "Yaebin Moon and Wanju Doh and Kwanhee Kyung and Eojin Lee and Jung Ho Ahn", title = "{ADT}: Aggressive Demotion and Promotion for Tiered Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3236685", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Park:2023:CEE, author = "Gyeongseo Park and Ki-Dong Kang and Minho Kim and Daehoon Kim", title = "{CoreNap}: Energy Efficient Core Allocation for Latency-Critical Workloads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3227629", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. 
Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Sim:2023:CCM, author = "Joonseop Sim and Soohong Ahn and Taeyoung Ahn and Seungyong Lee and Myunghyun Rhee and Jooyoung Kim and Kwangsik Shin and Donguk Moon and Euiseok Kim and Kyoung Park", title = "Computational {CXL-Memory} Solution for Accelerating Memory-Intensive Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3226482", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Ringlein:2023:ACD, author = "Burkhard Ringlein and Francois Abel and Dionysios Diamantopoulos and Beat Weiss and Christoph Hagleitner and Dietmar Fey", title = "Advancing Compilation of {DNNs} for {FPGAs} Using Operation Set Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3227643", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lee:2023:HHF, author = "Seonho Lee and Ranggi Hwang and Jongse Park and Minsoo Rhu", title = "{HAMMER}: Hardware-Friendly Approximate Computing for Self-Attention With Mean-Redistribution and Linearization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2022.3233832", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Bae:2023:ISF, author = "Hanyeoreum Bae and Donghyun Gouk and Seungjun Lee and Jiseon Kim and Sungjoon Koh and Jie Zhang and Myoungsoo Jung", title = "Intelligent {SSD} Firmware for Zero-Overhead Journaling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3243695", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Zhao:2023:RAL, author = "Xia Zhao and Guangda Zhang and Lu Wang and Yangmei Li and Yongjun Zhang", title = "{RouteReplies}: Alleviating Long Latency in Many-Chip-Module {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3255555", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Weston:2023:SLI, author = "Kevin Weston and Farabi Mahmud and Vahid Janfaza and Abdullah Muzahid", title = "{SmartIndex}: Learning to Index Caches to Improve Performance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "33--36", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3264478", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Khoram:2023:EEB, author = "Soroosh Khoram and Kyle Daruwalla and Mikko Lipasti", title = "Energy-Efficient {Bayesian} Inference Using Bitstream Computing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "37--40", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3238584", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Brana:2023:KSC, author = "Jennifer Brana and Brian C. Schwedock and Yatin A. Manerkar and Nathan Beckmann", title = "{Kobold}: Simplified Cache Coherence for Cache-Attached Accelerators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "1", pages = "41--44", month = jan # "\slash " # jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3269399", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jeon:2023:HAR, author = "Kiseok Jeon and Junghee Lee and Bumsoo Kim and James J. 
Kim", title = "Hardware Accelerated Reusable {Merkle} Tree Generation for Bitcoin Blockchain Headers", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "69--72", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3289515", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Lee:2023:CDC, author = "Hwanjun Lee and Seunghak Lee and Yeji Jung and Daehoon Kim", title = "{T-CAT}: Dynamic Cache Allocation for Tiered Memory Systems With Memory Interleaving", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "73--76", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3290197", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Jeong:2023:LLA, author = "Ipoom Jeong and Jiaqi Lou and Yongseok Son and Yongjoo Park and Yifan Yuan and Nam Sung Kim", title = "{LADIO}: Leakage-Aware Direct {I/O} for {I/O}-Intensive Workloads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "77--80", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3290427", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Deshpande:2023:TPB, author = "Chandana S. Deshpande and Arthur Perais and Fr{\'e}d{\'e}ric P{\'e}trot", title = "Toward Practical 128-Bit General Purpose Microarchitectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "81--84", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3287762", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "Intel introduced 5-level paging mode to support 57-bit virtual address space in 2017. This, coupled to paradigms where backup storage can be accessed through load and store instructions (e.g., non volatile memories), lets us envision a future in which a 64-bit address space has become insufficient. In that event, the straightforward solution would be to adopt a flat 128-bit address space. 
In this early stage letter, we conduct high-level experiments that lead us to suggest a possible general-purpose processor micro-architecture providing 128-bit support with limited hardware cost.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Tzenetopoulos:2023:DLD, author = "Achilleas Tzenetopoulos and Dimosthenis Masouros and Dimitrios Soudris and Sotirios Xydis", title = "{DVFaaS}: Leveraging {DVFS} for {FaaS} Workflows", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "85--88", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3288089", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Nam:2023:XRD, author = "Hwayong Nam and Seungmin Baek and Minbok Wi and Michael Jaemin Kim and Jaehyun Park and Chihun Song and Nam Sung Kim and Jung Ho Ahn", title = "{X}-ray: Discovering {DRAM} Internal Structure and Error Characteristics by Issuing Memory Commands", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "89--92", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3296153", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", abstract = "The demand for accurate information about the internal structure and characteristics of DRAM has been on the rise. 
Recent studies have explored the structure and characteristics of DRAM to improve processing in memory, enhance reliability, and mitigate a vulnerability known as rowhammer. However, DRAM manufacturers only disclose limited information through official documents, making it difficult to find specific information about actual DRAM devices. This paper presents reliable findings on the internal structure and characteristics of DRAM using activate-induced bitflips (AIBs), retention time test, and row-copy operation. While previous studies have attempted to understand the internal behaviors of DRAM devices, they have only shown results without identifying the causes or have analyzed DRAM modules rather than individual chips. We first uncover the size, structure, and operation of DRAM subarrays and verify our findings on the characteristics of DRAM. Then, we correct misunderstood information related to AIBs and demonstrate experimental results supporting the cause of rowhammer.", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", } @Article{Nematallah:2023:ELS, author = "Ahmed Nematallah and Chang Hyun Park and David Black-Schaffer", title = "Exploring the Latency Sensitivity of Cache Replacement Policies", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "93--96", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3296251", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Wed Sep 13 17:35:03 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% NOTE(review): the journal-URL values in this span used the http://
%%% scheme while the file header and all later entries use https://;
%%% normalized to https:// here.  The publisher site redirects http to
%%% https, so rendered output is unchanged.  Entries lacking the
%%% file-standard acknowledgement field now carry acknowledgement =
%%% ack-nhfb (the @String defined in the file preamble); bibdate is NOT
%%% fabricated for entries that never had one.

@Article{Mosquera:2023:GCC,
  author =       "Fernando Mosquera and Krishna Kavi and Gayatri Mehta and
                 Lizy John",
  title =        "Guard Cache: Creating Noisy Side-Channels",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "97--100",
  month =        jul # "\slash " # dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2023.3289710",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Mars:2023:JPP,
  author =       "Jason Mars and Yiping Kang and Roland Daynauth and
                 Baichuan Li and Ashish Mahendra and Krisztian Flautner and
                 Lingjia Tang",
  title =        "The {Jaseci} Programming Paradigm and Runtime Stack:
                 Building Scale-Out Production Applications Easy and Fast",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2023.3274038",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Hossain:2023:SDA,
  author =       "Naorin Hossain and Alper Buyuktosunoglu and John-David
                 Wellman and Pradip Bose and Margaret Martonosi",
  title =        "{SoCurity}: a Design Approach for Enhancing {SoC}
                 Security",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2023.3301448",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Feng:2023:SOW,
  author =       "Justin Feng and Fatemeh Arkannezhad and Christopher Ryu
                 and Enoch Huang and Siddhant Gupta and Nader Sehatbakhsh",
  title =        "Simulating Our Way to Safer Software: a Tale of
                 Integrating Microarchitecture Simulation and Leakage
                 Estimation Modeling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "109--112",
  month =        jul # "\slash " # dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2023.3303913",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Choi:2023:UPP,
  author =       "Jaewan Choi and Jaehyun Park and Kwanhee Kyung and Nam
                 Sung Kim and Jung Ho Ahn",
  title =        "Unleashing the Potential of {PIM}: Accelerating Large
                 Batched Inference of Transformer-Based Generative Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "113--116",
  month =        jul # "\slash " # dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2023.3305386",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "attention; Computational modeling; Context modeling;
                 Decoding; Matrix converters; Memory management;
                 processing-in-memory; Throughput; Transformer-based
                 generative model; Transformers",
}

@Article{Kim:2023:HAC,
  author =       "Yonghae Kim and Anurag Kar and Jaewon Lee and Jaekyu Lee
                 and Hyesoon Kim",
  title =        "Hardware-Assisted Code-Pointer Tagging for Forward-Edge
                 Control-Flow Integrity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "117--120",
  month =        jul # "\slash " # dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2023.3306326",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit.
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Authentication; Benchmark testing; CFI; Codes; CPT; Hardware; memory safety; Prototypes; RISC-V BOOM; Software; Tagging", } @Article{Saileshwar:2023:MBM, author = "Gururaj Saileshwar and Moinuddin Qureshi", title = "The Mirage of Breaking {MIRAGE}: Analyzing the Modeling Pitfalls in Emerging Attacks on {MIRAGE}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "121--124", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3297875", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; Cache side-channel attacks; Ciphers; Codes; Computer bugs; Indexing; randomized caches; Security; Side-channel attacks", } @Article{Lo:2023:LLV, author = "Yun-Chen Lo and Yu-Chih Tsai and Ren-Shuo Liu", title = "{LV}: Latency-Versatile Floating-Point Engine for High-Performance Deep Neural Networks", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "125--128", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3287096", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Adders; Approximate computation; Artificial neural networks; Clocks; Computer architecture; Electric breakdown; Engines; floating point; latency-versatile architecture; Registers", } @Article{Goudarzi:2023:SBP, author = "Maziar Goudarzi and Reza Azimi and Julian Humecki and Faizaan Rehman and Richard Zhang and Chirag Sethi and Tanishq Bomman and Yuqi Yang", title = "By-Software Branch Prediction in Loops", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "129--132", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3304613", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "C.0.b hardware/software interfaces; C.1.1.b pipeline processors; C.1.5.a instruction fetch; Codes; D.3.4.b compilers; Hardware; Monitoring; Optimization; Program processors; Software; Target tracking", } @Article{Yun:2023:FPP, author = "Yugyoung Yun and Eunhyeok Park", title = "Fast Performance Prediction for Efficient Distributed {DNN} Training", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "133--136", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3316452", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3D parallelism; Costs; Distributed training; large language model; Optimization; Parallel processing; Performance evaluation; performance modeling; Tensors; Throughput; Training", } @Article{Wu:2023:CUD, author = "Meng Wu and Mingyu Yan and Xiaocheng Yang and Wenming Li and Zhimin Zhang and Xiaochun Ye and Dongrui Fan", title = "Characterizing and Understanding Defense Methods for {GNNs} on {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "137--140", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3304638", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "defense; Electric breakdown; Estimation; execution pattern; execution semantic; Graph neural networks; Graphics processing units; Kernel; overhead; Perturbation methods; Purification; Training", } @Article{Patel:2023:TIP, author = "Pratyush Patel and Zibo Gong and Syeda Rizvi and Esha Choukse and Pulkit Misra and Thomas Anderson and Akshitha Sriraman", title = "Towards Improved Power Management in Cloud {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "141--144", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3278652", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Clocks; Cloud computing; design for power delivery limits; Graphics processing units; graphics processors; Monitoring; Performance evaluation; Power management; Power system management; servers; Servers; super (very large) computers", } @Article{Zhang:2023:BPA, author = "Shiqing Zhang and Mahmood Naderan-Tahan and Magnus Jahre and Lieven Eeckhout", title = "Balancing Performance Against Cost and Sustainability in Multi-Chip-Module {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "145--148", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3313203", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Aggregates; Bandwidth; Costs; Graphics processing units; Manufacturing; Sustainable development; Switches", } @Article{Park:2023:DHP, author = "Chanyoung Park and Chun-Yi Liu and Kyungtae Kang and Mahmut Kandemir and Wonil Choi", title = "Design of a High-Performance, High-Endurance Key-Value {SSD} for Large-Key Workloads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "149--152", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3282276", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Blogs; Data structures; Key-value SSD; large-key workloads; Micromechanical devices; Performance evaluation; Random access memory; Social networking (online); Tail", } @Article{Liu:2023:ILG, author = "Jie Liu and Zhongyuan Zhao and Zijian Ding and Benjamin Brock and Hongbo Rong and Zhiru Zhang", title = "An Intermediate Language for General Sparse Format Customization", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "153--156", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3262610", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Codes; Compilers; Hardware; heterogeneous (hybrid) systems; Indexes; Kernel; Layout; Metadata; sparse linear algebra; specialized application languages; Tensors", } @Article{Lee:2023:NPR, author = "Seunghak Lee and Ki-Dong Kang and Gyeongseo Park and Nam Sung Kim and Daehoon Kim", title = "{NoHammer}: Preventing Row Hammer With Last-Level Cache Management", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "157--160", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3320670", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Degradation; DRAM; Indexes; Last-level cache management; Memory management; Proposals; Random access memory; reliability; Reverse engineering; row hammer; Threat modeling", } @Article{Escofet:2023:HQA, author = "Pau Escofet and Anabel Ovide and Carmen G. Almudever and Eduard Alarc{\'o}n and Sergi Abadal", title = "{Hungarian} Qubit Assignment for Optimized Mapping of Quantum Circuits on Multi-Core Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "161--164", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3318857", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Computers; Costs; Logic gates; Mapping of quantum algorithms; multi-core quantum computing architectures; Partitioning algorithms; Quantum computing; quantum computing; Qubit", } @Article{Lu:2023:FEA, author = "Lingfei Lu and Yudi Qiu and Shiyan Yi and Yibo Fan", title = "A Flexible Embedding-Aware Near Memory Processing Architecture for Recommendation System", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "165--168", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3305668", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Computer architecture; data partition; Fans; Kernel; near memory processing; Random access memory; Recommendation system; Recommender systems; Social networking (online)", } @Article{Li:2023:HFT, author = "Hailong Li and Jaewan Choi and Yongsuk Kwon and Jung Ho Ahn", title = "A Hardware-Friendly Tiled Singular-Value Decomposition-Based Matrix Multiplication for Transformer-Based Models", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "169--172", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3323482", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computational modeling; GPU; Graphics processing units; Kernel; Matrix decomposition; Natural language processing; Task analysis; tiled singular vector decomposition; Transformer-based model; Transformers", } @Article{Hastings:2023:ASR, author = "Adam Hastings and Ryan Piersma and Simha Sethumadhavan", title = "Architectural Security Regulation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "173--176", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3327952", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Costs; Games; Government; Modeling techniques; Regulation; Regulators; Safety; Security; security regulation; support for security", } @Article{Trochatos:2023:QCT, author = "Theodoros Trochatos and Chuanqi Xu and Sanjay Deshpande and Yao Lu and Yongshan Ding and Jakub Szefer", title = "A Quantum Computer Trusted Execution Environment", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "177--180", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3325852", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Attenuation; Cloud computing; cloud computing; Computer security; control pulses; Cryptography; dilution refrigerator; Hardware; Logic gates; obfuscation; Quantum computing; quantum computing; Qubit; RF switches", } @Article{Wu:2023:RAI, author = "Peiyun Wu and Trung Le and Zhichun Zhu and Zhao Zhang", title = "Redundant Array of Independent Memory Devices", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "181--184", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3334989", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Data transfer; Error correction codes; Layout; Memory management; Memory systems; mini-rank; multi-bit errors; Organizations; parity; Performance evaluation; redundant array; Standards organizations", } @Article{Garcia-Mallen:2023:TAD, author = "Jonathan Garcia-Mallen and Shuohao Ping and Alex Miralles-Cordal and Ian Martin and Mukund Ramakrishnan and Yipeng Huang", title = "Towards an Accelerator for Differential and Algebraic Equations Useful to Scientists", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "22", number = "2", pages = "185--188", month = jul # "\slash " # dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3332318", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Convergence; Differential equations; Field programmable gate arrays; Hardware; Hyperbolic equations; Iterative methods; iterative methods; reconfigurable hardware; Registers; Scientific computing", } @Article{Vieira:2024:GAP, author = "Jo{\~a}o Vieira and Nuno Roma and Gabriel Falcao and Pedro Tom{\'a}s", title = "{gem5-accel}: a Pre-{RTL} Simulation Toolchain for Accelerator Architecture Validation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3329443", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator modeling; Central Processing Unit; complete system emulation; Computer architecture; Hardware acceleration; Kernel; Process control; Random access memory; Registers; Simulation toolchain", } @Article{Gheibi-Fetrat:2024:TTF, author = "Atiyeh Gheibi-Fetrat and Negar Akbarzadeh and Shaahin Hessabi and Hamid Sarbazi-Azad", title = "{Tulip}: Turn-Free Low-Power Network-on-Chip", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3339646", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "area; Chip Multiprocessor (CMP); crossbar; Integrated circuits; Mesh networks; Network topology; Network-on-chip; Network-on-Chip (NoC); power consumption; router; Routing; System recovery; System-on-Chip (SoC); Topology", } @Article{Ueno:2024:ITB, author = "Yosuke Ueno and Yuna Tomida and Teruo Tanimoto and Masamitsu Tanaka and Yutaka Tabuchi and Koji Inoue and Hiroshi Nakamura", title = "Inter-Temperature Bandwidth Reduction in Cryogenic {QAOA} Machines", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "9--12", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3322700", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer architecture; cryogenic electronics;
                 Cryogenics; Logic gates; Quantum computing; quantum
                 computing; Qubit; qubit; Superconducting cables;
                 superconducting logic circuits",
}

%%% NOTE(review): pages for Kim:2024:FAD read "7--10" in the source
%%% data, which overlaps Gheibi-Fetrat:2024:TTF (pages 5--8) and
%%% Ueno:2024:ITB (pages 9--12) in the same volume 23(1), and left
%%% 13--16 unused before Peltekis:2024:DDM (pages 17--20).  Corrected
%%% to "13--16" on the assumption the issue is paginated sequentially;
%%% TODO verify against the publisher's final pagination for
%%% DOI 10.1109/LCA.2023.3336841.

@Article{Kim:2024:FAD,
  author =       "Hyeseong Kim and Yunjae Lee and Minsoo Rhu",
  title =        "{FPGA}-Accelerated Data Preprocessing for Personalized
                 Recommendation Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2023.3336841",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Data models; Data preprocessing; data preprocessing;
                 Feature extraction; FPGA; Graphics processing units;
                 Personalized recommendation system; Servers; Throughput;
                 training; Training",
}

@Article{Peltekis:2024:DDM,
  author =       "Christodoulos Peltekis and Vasileios Titopoulos and
                 Chrysostomos Nicopoulos and Giorgos Dimitrakopoulos",
  title =        "{DeMM}: a Decoupled Matrix Multiplication Engine
                 Supporting Relaxed Structured Sparsity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3355178",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit.
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computational modeling; Engines; Hardware; Indexes; Machine learning accelerator; matrix-multiplication engine; Organizations; Sparse matrices; structured sparsity; Systolic arrays; systolic computation", } @Article{Corontzos:2024:DCD, author = "Caden Corontzos and Eitan Frachtenberg", title = "Direct-Coding {DNA} With Multilevel Parallelism", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "21--24", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3355109", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Decoding; DNA; DNA encoding; Encoding; Genomics; Instruction sets; parallel architectures; Random access memory; Throughput", } @Article{Ayanzadeh:2024:ERR, author = "Ramin Ayanzadeh and Moinuddin Qureshi", title = "Enhancing the Reach and Reliability of Quantum Annealers by Pruning Longer Chains", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3340030", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Adiabatic quantum computing; Annealing; Computers; embedding; Hardware; power-law; quantum annealers; Quantum annealing; Quantum circuit; Quantum computing; Qubit", } @Article{Golden:2024:SVV, author = "Courtney Golden and Dan Ilan and Caroline Huang and Niansong Zhang and Zhiru Zhang and Christopher Batten", title = "Supporting a Virtual Vector Instruction Set on a Commercial Compute-in-{SRAM} Accelerator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3341389", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; hardware/software interfaces; In-memory computing; Instruction sets; Latches; Microarchitecture; Process control; Programming; Registers", } @Article{Thomas:2024:BMT, author = "Samuel Thomas and Kidus Workneh and Ange-Thierry Ishimwe and Zack McKevitt and Phaedra Curlin and R. Iris Bahar and Joseph Izraelevitz and Tamara Lehman", title = "Baobab {Merkle} Tree for Efficient Secure Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "33--36", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3360709", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Encryption; encryption; Indexes; integrity; Memory management; Metadata; Protocols; secure memory; Security; System-on-chip", } @Article{Cho:2024:EEA, author = "Minsik Cho and Keivan A. Vahid and Qichen Fu and Saurabh Adya and Carlo C. {Del Mundo} and Mohammad Rastegari and Devang Naik and Peter Zatloukal", title = "{eDKM}: an Efficient and Accurate Train-Time Weight Clustering for Large Language Models", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "37--40", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3363492", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "artificial intelligence; Complexity theory; Computational and artificial intelligence; deep learning; Graphics processing units; Indexes; learning systems; machine learning; Memory; Optimization; Sharding; Tensors", } @Article{Kim:2024:ADR, author = "Yang-Gon Kim and Yun-Ki Han and Jae-Kang Shin and Jun-Kyum Kim and Lee-Sup Kim", title = "Accelerating Deep Reinforcement Learning via Phase-Level Parallelism for Robotics Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "41--44", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3341152", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Backpropagation; Computer systems organization; Graphics processing units; Hardware; Legged locomotion; mobile computing; neural nets; Reinforcement learning; Robots; Training", } @Article{Yang:2024:JIJ, author = "Yuxin Yang and Xiaoming Chen and Yinhe Han", title = "{JANM-IK}: {Jacobian} Argumented {Nelder--Mead} Algorithm for Inverse Kinematics and its Hardware Acceleration", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "45--48", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3369940", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "accelerator; Convergence; End effectors; Field programmable gate arrays; inverse kinematics; Jacobian; Jacobian matrices; Kinematics; nelder-mead; Perturbation methods; Robotics; Robots; software-hardware co-design", } @Article{Hafezan:2024:IEE, author = "Mohammad Hafezan and Ehsan Atoofian", title = "Improving Energy-Efficiency of Capsule Networks on Modern {GPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "49--52", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3365149", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "CapsNet; Computer architecture; energy-efficiency; GPU; Graphics processing units; Hidden Markov models; Instruction sets; Matrix converters; Registers; tensor core; Vectors", } @Article{Nagabhiru:2024:AFP, author = "Mahita Nagabhiru and Gregory T. Byrd", title = "Achieving Forward Progress Guarantee in Small Hardware Transactions", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "53--56", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3370992", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Atomics; Coherence; compare-and-swap; concurrency; Data structures; forward progress; Hardware; hardware transactional memory; Instruction sets; lock-free; multi-word-compare-and-swap; multithreading; non-blocking; Programming; Protocols; Software", } @Article{Ma:2024:PFA, author = "Rui Ma and Jia-Ching Hsu and Ali Mansoorshahi and Joseph Garvey and Michael Kinsner and Deshanand Singh and Derek Chiou", title = "{Primate}: a Framework to Automatically Generate Soft Processors for Network Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "57--60", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3358839", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Codes; Design methodology; domain-specific accelerators; Field programmable gate arrays; flexibility; Libraries; programmability; Registers; Software; Throughput; VLIW", } @Article{France:2024:RSA, author = "Lo{\"\i}c France and Florent Bruguier and David Novo and Maria Mushtaq and Pascal Benoit", title = "Reducing the Silicon Area Overhead of Counter-Based Rowhammer Mitigations", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "61--64", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3328824", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Capacitors; Computer security; DRAM; Proposals; Random access memory; rowhammer; Security; Silicon; Timing; Transistors", } @Article{Yavits:2024:DCD, author = "L. Yavits", title = "{DRAMA}: Commodity {DRAM} Based Content Addressable Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "65--68", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3341830", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "CAM; DNA; DRAM; Hardware; Humanities; Random access memory; Three-dimensional displays; Timing; Voltage", } @Article{Mishra:2024:ASA, author = "Deepanjali Mishra and Konstantinos Kanellopoulos and Ashish Panwar and Akshitha Sriraman and Vivek Seshadri and Onur Mutlu and Todd C. Mowry", title = "Address Scaling: Architectural Support for Fine-Grained Thread-Safe Metadata Management", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "69--72", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3373760", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Complexity theory; Computer bugs; Data structures; dynamic program monitoring tools; Hardware; intermediate address space; Metadata; metadata management; Monitoring; Synthetic aperture sonar; Virtual memory", } @Article{Shin:2024:CMR, author = "Changmin Shin and Taehee Kwon and Jaeyong Song and Jae Hyung Ju and Frank Liu and Yeonkyu Choi and Jinho Lee", title = "A Case for In-Memory Random Scatter--Gather for Fast Graph Processing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "73--77", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3376680", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Accelerator architectures; Bandwidth; Computer architecture; in-memory computing; memory architecture; Memory management; parallel processing; Protocols; Random access memory; random access memory; Random sequences; Standards", } @Article{Eeckhout:2024:RPG, author = "Lieven Eeckhout", title = "{R.I.P.} Geomean Speedup Use Equal-Work (Or Equal-Time) Harmonic Mean Speedup Instead", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "78--82", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3361925", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arithmetic; Average; Benchmark testing; Computer architecture; Harmonic analysis; Measurement; performance metrics; Research and development; speedup; Workstations", } @Article{Jahshan:2024:MMB, author = "Z. Jahshan and L. Yavits", title = "{MajorK}: Majority Based kmer Matching in Commodity {DRAM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "83--86", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3384259", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "$K$ mer matching; Bioinformatics; Computer architecture; Databases; DNA; DRAM; genome classification; Genomics; Microprocessors; Random access memory", } @Article{Yi:2024:GSM, author = "Shiyan Yi and Yudi Qiu and Lingfei Lu and Guohao Xu and Yong Gong and Xiaoyang Zeng and Yibo Fan", title = "{GATe}: Streamlining Memory Access and Communication to Accelerate Graph Attention Network With Near-Memory Processing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "87--90", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3386734", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "DRAM; Fans; Graph attention network; Logic gates; Mathematical models; near memory processing; Optimization; Random access memory; Social networking (online); Vectors", } @Article{Sasmal:2024:AMD, author = "Mrinmay Sasmal and Tresa Joseph and Bindiya T. S.", title = "Approximate Multiplier Design With {LFSR}-Based Stochastic Sequence Generators for Edge {AI}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "91--94", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3379002", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "approximate multiplier (AM); Artificial neural networks; Computer architecture; Generators; Hardware; linear feedback shift register (LFSR); Long short term memory; long short term memory (LSTM); matrix vector multiplier (MVM); Neural networks; Stochastic processes; Streams", } @Article{Gohil:2024:IGM, author = "Varun Gohil and Sundar Dev and Gaurang Upasani and David Lo and Parthasarathy Ranganathan and Christina Delimitrou", title = "The Importance of Generalizability in Machine Learning for Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "95--98", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3384449", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Bayes methods; Bayesian neural networks; Computational modeling; Data models; generalizability; Internet; machine learning for systems; Predictive models; Uncertainty; uncertainty estimation", } @Article{Agarwal:2024:UTU, author = "Nikhil Agarwal and Mitchell Fream and Souradip Ghosh and Brian C. 
Schwedock and Nathan Beckmann", title = "{UDIR}: Towards a Unified Compiler Framework for Reconfigurable Dataflow Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "99--103", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3342130", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Codes; Computer architecture; Dataflow; Hardware; intermediate representation; Optimization; Program processors; reconfigurable architectures; Semantics; Synchronization", } @Article{Tsantikidou:2024:AEA, author = "Kyriaki Tsantikidou and Nicolas Sklavos", title = "An Area Efficient Architecture of a Novel Chaotic System for High Randomness Security in e-Health", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "104--107", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3387352", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu May 2 08:20:13 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Area efficient architecture; Chaotic communication; chaotic system; Ciphers; Computer architecture; e-health; Encryption; high randomness; key scheduling; NIST; Protocols; Security; security; stream cipher", } @Article{Park:2024:DND, author = "Yongmo Park and Subhankar Pal and Aporva Amarnath and Karthik Swaminathan and Wei D. Lu and Alper Buyuktosunoglu and Pradip Bose", title = "{Dramaton}: a Near-{DRAM} Accelerator for Large Number Theoretic Transforms", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "108--111", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3381452", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Cryptography; Geometry; Hardware; hardware accelerators; Layout; near-DRAM processing; number theoretic transform; Parallel processing; Post-quantum cryptography; Random access memory; Transforms", } @Article{Luo:2024:RMM, author = "Haocong Luo and Yahya Can Tu{\u{g}}rul and F. Nisa Bostanc{\i} and Ataberk Olgun and A. 
Giray Ya{\u{g}}l{\i}k{\c{c}}{\i} and Onur Mutlu", title = "{Ramulator 2.0}: a Modern, Modular, and Extensible {DRAM} Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "112--116", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3333759", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "C++ languages; Codes; Computer architecture; computer simulation systems engineering and theory; computers and information processing; DRAM systems engineering and theory; Extensibility; memory; memory architecture computers and information processing; memory management computers and information processing; modeling; Organizations; Random access memory; random access memory; scalability; simulation; Software architecture; system analysis and design; system simulation systems engineering and theory; Timing", } @Article{Kim:2024:EIA, author = "Hyungyo Kim and Gaohan Ye and Nachuan Wang and Amir Yazdanbakhsh and Nam Sung Kim", title = "Exploiting {Intel Advanced Matrix Extensions (AMX)} for Large Language Model Inference", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "117--120", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3397747", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "advance matrix extensions; Arithmetic; Computational modeling; cooperative heterogeneous computing; Data models; Data transfer; Graphics processing units; Large language models; Memory management; Throughput", } @Article{Li:2024:TLV, author = "Tianzheng Li and Enfang Cui and Yuting Wu and Qian Wei and Yue Gao", title = "{TeleVM}: a Lightweight Virtual Machine for {RISC-V} Architecture", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "121--124", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3394835", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer architecture; Hardware; Hypervisor; Registers; RISC-V; Security; serverless; Virtual machine monitors; Virtual machining; Virtualization; virtualization", } @Article{Qi:2024:AIG, author = "Yingjie Qi and Jianlei Yang and Ao Zhou and Tong Qiao and Chunming Hu", title = "Architectural Implications of {GNN} Aggregation Programming Abstractions", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "125--128", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3326170", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "characterization; execution patterns; Graph neural networks; Graph neural networks (GNNs); Graphics processing units; Indexes; Kernel; Organizations; Programming; programming abstractions; Taxonomy", } @Article{Khan:2024:EML, author = "Asif Ali Khan and Fazal Hameed and Taha Shahroodi and Alex K. Jones and Jeronimo Castrillon", title = "Efficient Memory Layout for Pre-Alignment Filtering of Long {DNA} Reads Using Racetrack Memory", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "129--132", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3350701", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bioinformatics; DNA; Domain wall memory; Filtering; Filtering algorithms; Genomics; Layout; near memory computing; racetrack memory; sequence alignment; Sequential analysis", } @Article{Maji:2024:SCP, author = "Saurav Maji and Kyungmi Lee and Anantha P. 
Chandrakasan", title = "{SparseLeakyNets}: Classification Prediction Attack Over Sparsity-Aware Embedded Neural Networks Using Timing Side-Channel Information", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "133--136", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3397730", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Architectural attacks; Arrhythmia; Artificial neural networks; classification prediction; Data mining; Electrocardiography; Hardware; neural networks; side-channel attacks; System-on-chip; Timing; timing side-channel", } @Article{Rezaei:2024:SMD, author = "Seyyed Hossein SeyyedAghaei Rezaei and Parham Zilouchian Moghaddam and Mehdi Modarressi", title = "Smart Memory: Deep Learning Acceleration in {3D}-Stacked Memories", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "1", pages = "137--141", month = jan # "\slash " # jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3287976", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Sat Aug 24 09:55:05 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "3D-stacked memory; Artificial neural networks; Bandwidth; Computer architecture; deep learning accelerator; Memory management; Network-on-memory; processing-in-memory; Random access memory; Switches; Three-dimensional displays", } @Article{Katebi:2024:FFV, author = "Hossein Katebi and Navidreza Asadi and Maziar Goudarzi", title = "{FullPack}: Full Vector Utilization for Sub-Byte Quantized Matrix--Vector Multiplication on General Purpose {CPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "142--145", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3370402", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Deep learning; Graphics processing units; hardware acceleration; Load modeling; Memory management; Quantization (signal); Registers; Vectors", } @Article{Alcorta:2024:CML, author = "Erika S. Alcorta and Mahesh Madhav and Richard Afoakwa and Scott Tetrick and Neeraja J. 
Yadwadkar and Andreas Gerstlauer", title = "Characterizing Machine Learning-Based Runtime Prefetcher Selection", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "146--149", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3404887", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Benchmark testing; Computer architecture; Hardware; machine learning; memory management; Micromechanical devices; parallel architectures; Prefetching; Runtime; Training; Vectors", } @Article{Kakolyris:2024:SAG, author = "Andreas Kosmas Kakolyris and Dimosthenis Masouros and Sotirios Xydis and Dimitrios Soudris", title = "{SLO}-Aware {GPU} {DVFS} for Energy-Efficient {LLM} Inference Serving", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "150--153", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3406038", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Dynamic voltage frequency scaling (DVFS); Frequency measurement; Graphics processing units; inference serving; large language models; Optimization; Predictive models; Servers; system optimization; Tail; Time-frequency analysis", } @Article{Yoon:2024:QAS, author = "Dongho Yoon and Taehun Kim and Jae W. Lee and Minsoo Rhu", title = "A Quantitative Analysis of State Space Model-Based Large Language Model: Study of Hungry Hungry Hippos", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "154--157", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3422492", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computational complexity; Computational modeling; Convolution; GPU; Graphics processing units; h3; large language models; Mathematical models; Memory management; state space model; Vectors", } @Article{Ajdari:2024:EAA, author = "Mohammadamin Ajdari and Behrang Montazerzohour and Kimia Abdi and Hossein Asadi", title = "Empirical Architectural Analysis on Performance Scalability of Petascale All-Flash Storage Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "158--161", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3418874", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Backplanes; Computer architecture; data storage; Hardware; Parallel processing; performance; RAID; Scalability; Solid-state drives; Synthetic aperture sonar; Topology", } @Article{Mohammadpur-Fard:2024:EDM, author = "Ali Mohammadpur-Fard and Sina Darabi and Hajar Falahati and Negin Mahani and Hamid Sarbazi-Azad", title = "Exploiting Direct Memory Operands in {GPU} Instructions", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "162--165", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3371062", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Arithmetic; CISC; Computer architecture; GPGPU; Graphics processing units; Hardware; Reduced instruction set computing; register file; Registers; RISC; Standards", } @Article{Andreu:2024:HAT, author = "Pablo Andreu and Pedro Lopez and Carles Hernandez", title = "Hashing {ATD} Tags for Low-Overhead Safe Contention Monitoring", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "166--169", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3401570", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ATD; caches; Certification; contention; Hardware; Monitoring; multicore; Multicore processing; Safety; Task analysis; Timing", } @Article{Gurevin:2024:EIR, author = "Deniz Gurevin and Caiwen Ding and Omer Khan", title = "Exploiting Intrinsic Redundancies in Dynamic Graph Neural Networks for Processing Efficiency", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "170--174", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3340504", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Adaptation models; Computational modeling; Data models; Distributed and scalable parallelism; dynamic graphs; Graph neural networks; graph neural networks; Graphics processing units; Kernel; Redundancy", } @Article{Matsuo:2024:TCE, author = "Reoma Matsuo and Toru Koizumi and Hidetsugu Irie and Shuichi Sakai and Ryota Shioya", title = "{TURBULENCE}: Complexity-Effective Out-of-Order Execution on {GPU} With Distance-Based {ISA}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "175--178", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2023.3289317", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Decoding; Dynamic scheduling; Energy efficiency; GPU; Graphics processing units; instruction-level parallelism; Microarchitecture; microarchitecture; Out of order; out-of-order execution; Registers; Relays", } @Article{Lee:2024:ADT, author = "Dongjae Lee and Bongjoon Hyun and Taehun Kim and Minsoo Rhu", title = "Analysis of Data Transfer Bottlenecks in Commercial {PIM} Systems: a Study With {UPMEM--PIM}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "179--182", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3387472", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Bandwidth; Data transfer; Memory management; near-memory processing; parallel architecture; Processing-in-memory (PIM); Random access memory; Runtime library; Software; Throughput", } @Article{Yu:2024:ACP, author = "Seunghyuk Yu and Hyeonu Kim and Kyoungho Jeun and Sunyoung Hwang and Eojin Lee", title = "Architecting Compatible {PIM} Protocol for {CPU--PIM} Collaboration", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "183--186", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3432936", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "and memory scheduling; Generators; memory command; Memory management; memory protocol; Performance evaluation; Processing-in-memory; Protocols; Random access memory; Standards; Vectors", } @Article{Tu:2024:LLT, author = "Yazheng Tu and Pengzhou He and Chip-Hong Chang and Jiafeng Xie", title = "{LTE}: Lightweight and Time-Efficient Hardware Encoder for Post-Quantum Scheme {HQC}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "187--190", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3435495", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. Lett.", fjournal = "IEEE Computer Architecture Letters", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Codes; Encoding; Galois fields; Hardware; hardware design; HQC encoder; Logic gates; Polynomials; PQC; Reed--Muller code; Reed--Solomon code", } @Article{Hossam:2024:OCA, author = "Mohamed Hossam and Salah Hessien and Mohamed Hassan", title = "{Octopus}: a Cycle-Accurate Cache System Simulator", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "23", number = "2", pages = "191--194", month = jul # "\slash " # dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2024.3441941", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jan 10 10:00:13 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Comput. Archit. 
Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Cache coherence; cache memory; Codes; Coherence; Computational modeling; Extensibility; interconnect; Monitoring; Protocols; simulation",
}

@Article{Baidya:2024:EIK,
  author =       "Paresh Baidya and Rourab Paul and Swagata Mandal and Sumit Kumar Debnath",
  title =        "Efficient Implementation of {Knuth Yao} Sampler on Reconfigurable Hardware",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "195--198",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3454490",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cryptography; Discrete Gaussian sampling; Field programmable gate arrays; Hamming weight; Hardware; Knuth Yao sampler; lattice based cryptography; Lenses; Polynomials; post-quantum cryptography; Registers",
}

@Article{Xie:2024:SCB,
  author =       "Rui Xie and Asad {Ul Haq} and Linsen Ma and Krystal Sun and Sanchari Sen and Swagath Venkataramani and Liu Liu and Tong Zhang",
  title =        "{SmartQuant}: {CXL}-Based {AI} Model Store in Support of Runtime Configurable Weight Quantization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "199--202",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3452699",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Artificial intelligence; Computational modeling; CXL; Generative AI; Load modeling; Memory management; Quantization; Quantization (signal); Random access memory",
}

@Article{Cho:2024:PEC,
  author =       "Haeyoon Cho and Hyojun Son and Jungmin Choi and Byungil Koh and Minho Ha and John Kim",
  title =        "Proactive Embedding on Cold Data for Deep Learning Recommendation Model Training",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "203--206",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3445948",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Backpropagation; Data models; Deep learning; Graphics processing units; Parallel processing; Pipelines; recommendation system; Training",
}

@Article{Ji:2024:APB,
  author =       "Hyesung Ji and Sangpyo Kim and Jaewan Choi and Jung Ho Ahn",
  title =        "Accelerating Programmable Bootstrapping Targeting Contemporary {GPU} Microarchitecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "207--210",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3418448",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational complexity; Cryptography; Fully homomorphic encryption; GPU; Graphics processing units; GSW-based schemes; Kernel; Parallel processing; Polynomials; programmable bootstrapping; Table lookup",
}

@Article{Degawa:2024:COD,
  author =       "Yuya Degawa and Shota Suzuki and Junichiro Kadomoto and Hidetsugu Irie and Shuichi Sakai",
  title =        "Cycle-Oriented Dynamic Approximation: Architectural Framework to Meet Performance Requirements",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "211--214",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3439318",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Approximate computing; Approximation algorithms; Computer architecture; hardware/software interfaces; Heuristic algorithms; Prediction algorithms; processor architectures; Quality of service; Registers; Task analysis",
}

@Article{Mahmud:2024:FHI,
  author =       "Md Tareq Mahmud and Ke Wang",
  title =        "A Flexible Hybrid Interconnection Design for High-Performance and Energy-Efficient Chiplet-Based Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "215--218",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3477253",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Broadcasting; Chiplets; Computer architecture; Directive antennas; Hardware; hybrid interconnection; Integrated circuit interconnections; network-on-chip (NoC); Receiving antennas; Routing; Switches; Tail; wireless; Wireless communication",
}

@Article{Ham:2024:OFC,
  author =       "Hyungkyu Ham and Wonhyuk Yang and Yunseon Shin and Okkyun Woo and Guseul Heo and Sangyeop Lee and Jongse Park and Gwangsun Kim",
  title =        "{ONNXim}: a Fast, Cycle-Level Multi-Core {NPU} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "219--222",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3484648",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Artificial neural networks; Computational modeling; Deep learning; DNN inference; Kernel; Libraries; multi-tenancy; NPU; ONNX; Random access memory; Runtime; simulator; Systolic arrays; Tensors; Vectors",
}

@Article{Zhu:2024:SSP,
  author =       "Shizhuo Zhu and Illia Shkirko and Jacob Levinson and Zhengrong Wang and Tony Nowatzki",
  title =        "{SPGPU}: Spatially Programmed {GPU}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "223--226",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3499339",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "GPU; Graphics processing units; Hardware; Instruction sets; Kernel; Message systems; on-chip networks; Optimization; Pipelines; Programming; programming abstractions; Registers; shared-data locality; Spatial architectures; Topology",
}

@Article{Cho:2024:CAT,
  author =       "Eunyeong Cho and Jehyeon Bang and Minsoo Rhu",
  title =        "Characterization and Analysis of Text-to-Image Diffusion Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "227--230",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3466118",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Diffusion model; Diffusion models; Electric breakdown; GPU; Mathematical models; Noise; Noise reduction; sparse attention; Transformers",
}

@Article{Samandi:2024:CHM,
  author =       "Farid Samandi and Natheesan Ratnasegar and Michael Ferdman",
  title =        "A Case for Hardware Memoization in Server {CPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "231--234",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3505075",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Codes; Costs; Hardware; hardware memoization; Logic; Microarchitecture; Out of order; Pipelines; Program processors; Registers; Servers; Software",
}

@Article{Cha:2024:GGC,
  author =       "Hanna Cha and Sungchul Lee and Yeonan Ha and Hanhwi Jang and Joonsung Kim and Youngsok Kim",
  title =        "{GCStack}: a {GPU} Cycle Accounting Mechanism for Providing Accurate Insight Into {GPU} Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "235--238",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3476909",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Computer architecture; CPI stack; cycle accounting; Degradation; GPU; Graphics processing units; Hazards; Instruction sets; Micromechanical devices; Pipelines; Synchronization; Tensors",
}

@Article{Wang:2024:ZEB,
  author =       "Hongtao Wang and Peiquan Jin",
  title =        "{ZoneBuffer}: an Efficient Buffer Management Scheme for {ZNS} {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "239--242",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3498103",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Buffer management; Clustering algorithms; Optimization; replacement; Technological innovation; ZNS SSDs",
}

@Article{Coulon:2024:SSA,
  author =       "Samuel Coulon and Tianyou Bao and Jiafeng Xie",
  title =        "{SCALES}: {SCALable} and Area-Efficient Systolic Accelerator for Ternary Polynomial Multiplication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "243--246",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3505872",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Area-efficient; block-processing; Field programmable gate arrays; FPGA; Matrix decomposition; Polynomials; Process control; Proposals; scalable; Shift registers; Systolic arrays; systolic hardware accelerator; ternary polynomial multiplication; Time complexity; Transforms; Vectors",
}

@Article{Choudhury:2024:QAS,
  author =       "Navnil Choudhury and Chao Lu and Kanad Basu",
  title =        "Quantum Assertion Scheme for Assuring Qudit Robustness",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "2",
  pages =        "247--250",
  month =        jul # "\slash " # dec,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3483840",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer bugs; Delays; Energy states; Hilbert space; Logic gates; Noise; Quantum assertions; Quantum circuit; Quantum computing; Quantum system; Qubit",
}

@Article{Bong:2025:IEI,
  author =       "Haseung Bong and Nahyeon Kang and Youngsok Kim and Joonsung Kim and Hanhwi Jang",
  title =        "{IntervalSim++}: Enhanced Interval Simulation for Unbalanced Processor Designs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "24",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3514917",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Analytical modeling; Analytical models; Calculators; Computational modeling; Engines; Load modeling; Microarchitecture; Parallel processing; performance evaluation and analysis; Pipelines; simulation; Steady-state",
}

@Article{Chun:2025:SSA,
  author =       "Myoungjun Chun and Jaeyong Lee and Inhyuk Choi and Jisung Park and Myungsuk Kim and Jihong Kim",
  title =        "{Straw}: a Stress-Aware {WL}-Based Read Reclaim Technique for High-Density {NAND} Flash-Based {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "24",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3516205",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Data reliability; Flash memories; NAND flash memory; read disturbance; Reliability; Reliability engineering; Semiconductor device measurement; Solid modeling; solid-state drive (SSD); Stacking; Stress; Target tracking; Three-dimensional displays; Tunneling",
}

@Article{Vadlamudi:2025:EEI,
  author =       "Chaithanya Krishna Vadlamudi and Bahar Asgari",
  title =        "{Electra}: Eliminating the Ineffectual Computations on Bitmap Compressed Matrices",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "24",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2024.3516057",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jan 10 10:00:13 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "IEEE Comput. Archit. Lett.",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adders; Arithmetic; bitmap; Computational efficiency; Computational modeling; Computer architecture; Corporate acquisitions; Hardware; Logic gates; scheduler; Software; Sparse matrices; sparse problems; SpMSpM",
}

%%% [02-May-2024] check last issue: papers are added during each half year