%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.58", %%% date = "10 November 2025", %%% time = "17:03:03 MDT", %%% filename = "sigarch.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "https://www.math.utah.edu/~beebe", %%% checksum = "63952 99953 479337 4537616", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "bibliography; BibTeX; Computer Architecture %%% News; International Symposium on Computer %%% Architecture (ISCA); SIGARCH", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is an almost complete BibTeX %%% bibliography for ACM SIGARCH Computer %%% Architecture News (CODEN CANED2, ISSN %%% 0163-5964 (print), 1943-5851 (electronic)), %%% which began publishing with volume 1, issue %%% 1, in January 1972. The journal appears four %%% to nine times a year, with five annual issues %%% in recent years. Publication ceased with %%% volume 45, number 2, in May 2017. %%% %%% The incompleteness is due to holes in the ACM %%% Portal Database: there are at least 8 issues %%% for which no entry at all is present, or %%% there is an issue Web page, but its contents %%% are empty. 
The missing issues are: %%% %%% Volume 1 number 1 1972 %%% Volume 1 number 3 1972 %%% Volume 2 number 2 1973 %%% Volume 5 number 3 1976 %%% Volume 5 number 5 1976 %%% Volume 8 number 1 1980 %%% Volume 9 number 3 1981 %%% Volume 36 number 6 2008 %%% %%% The journal has World-Wide Web sites at %%% %%% https://dl.acm.org/newsletter/sigarch %%% http://www.acm.org/sigarch/ %%% http://www.cs.wisc.edu/~arch/www/ %%% %%% with tables of contents at %%% %%% https://dl.acm.org/loi/sigarch %%% %%% Some of the ISCA Conferences are jointly %%% sponsored by the ACM and the IEEE, and also %%% appear as an issue of Computer Architecture %%% News. The first ISCA Conference was held in %%% 1973. Tables of contents of the proceedings %%% volumes, and pointers to online article text, %%% may be available at %%% %%% http://portal.acm.org/browse_dl.cfm?idx=SERIES416 %%% http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=30879&isYear=YYYY %%% %%% where YYYY is a four-digit year. %%% %%% At version 1.58, the year coverage looked %%% like this: %%% %%% 1972 ( 8) 1988 ( 103) 2004 ( 77) %%% 1973 ( 35) 1989 ( 116) 2005 ( 115) %%% 1974 ( 44) 1990 ( 114) 2006 ( 99) %%% 1975 ( 9) 1991 ( 119) 2007 ( 97) %%% 1976 ( 53) 1992 ( 87) 2008 ( 90) %%% 1977 ( 56) 1993 ( 64) 2009 ( 91) %%% 1978 ( 33) 1994 ( 70) 2010 ( 104) %%% 1979 ( 16) 1995 ( 59) 2011 ( 108) %%% 1980 ( 67) 1996 ( 51) 2012 ( 113) %%% 1981 ( 18) 1997 ( 51) 2013 ( 132) %%% 1982 ( 81) 1998 ( 129) 2014 ( 120) %%% 1983 ( 74) 1999 ( 54) 2015 ( 69) %%% 1984 ( 60) 2000 ( 69) 2016 ( 135) %%% 1985 ( 67) 2001 ( 56) 2017 ( 112) %%% 1986 ( 66) 2002 ( 69) %%% 1987 ( 87) 2003 ( 63) %%% %%% Article: 3395 %%% Book: 1 %%% InProceedings: 82 %%% Proceedings: 32 %%% %%% Total entries: 3510 %%% %%% This bibliography was constructed primarily %%% from data in the ACM Portal database, and %%% from several on-line library catalogs. 
The %%% ACM Portal database lacks data for these
%%% volume(issue number) pairs: 1(1), 1(3), 2(2),
%%% 5(3), 5(5), 8(1), 9(3), 36(6), 37(1), and
%%% 41(1).
%%%
%%% Numerous errors in the sources noted above
%%% have been corrected. Spelling has been
%%% verified with the UNIX spell and GNU ispell
%%% programs using the exception dictionary
%%% stored in the companion file with extension
%%% .sok.
%%%
%%% BibTeX citation tags are uniformly chosen as
%%% name:year:abbrev, where name is the family
%%% name of the first author or editor, year is a
%%% 4-digit number, and abbrev is a 3-letter
%%% condensation of important title words.
%%% Citation labels were automatically generated
%%% by software developed for the BibNet Project.
%%%
%%% In this bibliography, entries are sorted in
%%% publication order, with the help of
%%% ``bibsort -byvolume''. The bibsort utility
%%% is available from
%%%
%%% https://www.math.utah.edu/pub/bibsort
%%% ftp://ftp.math.utah.edu/pub/bibsort
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility.",
%%% }
%%% ====================================================================
@Preamble{
    "\hyphenation{ }"
  # "\ifx \undefined \circled \def \circled #1{(#1)} \fi"
  # "\ifx \undefined \reg \def \reg {\circled{R}}\fi"
  # "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F.
Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-COMP-ARCH-NEWS = "ACM SIGARCH Computer Architecture News"}

%%% ====================================================================
%%% Publishers and their addresses:
@String{pub-ACM = "ACM Press"}
@String{pub-ACM:adr = "New York, NY 10036, USA"}

@String{pub-IEEE = "IEEE Computer Society Press"}
@String{pub-IEEE:adr = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA"}

@String{pub-MORGAN-KAUFMANN = "Morgan Kaufmann Publishers"}
@String{pub-MORGAN-KAUFMANN:adrsf = "San Francisco, CA, USA"}

%%% ====================================================================
%%% Bibliography entries, in publication order:

%%% TO DO: [04-Sep-2014] Volume 1 number 1: no data yet in ACM Portal database

@Article{Foster:1972:RDM,
  author =       "Caxton C. Foster",
  title =        "A review of dynamic memories with enhanced data access by {Harold S. Stone. IEEETC Vol. C-21, \#4, p 359--386, April 1972}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "2",
  pages =        "3--7",
  month =        apr,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:38 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bataille:1972:SOG,
  author =       "M.
Bataille",
  title =        "Something old: the {Gamma 60} the computer that was ahead of its time",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "2",
  pages =        "10--15",
  month =        apr,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:38 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Foster:1972:SNI,
  author =       "Caxton C. Foster",
  title =        "Something new: the {Intel MCS-4} micro computer set",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "2",
  pages =        "16--17",
  month =        apr,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:38 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1972:MNC,
  author =       "J. A. N. Lee",
  title =        "My next compiler",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "2",
  pages =        "17--19",
  month =        apr,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:38 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Flynn:1972:CAJ,
  author =       "Michael J. Flynn and Mrs.
Carol Rogers",
  title =        "Computer architecture at {Johns Hopkins}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "2",
  pages =        "21--33",
  month =        apr,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:38 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% TO DO: [04-Sep-2014] Volume 1 number 3: no data yet in ACM Portal database

@Article{Vaughan:1972:CAS,
  author =       "R. F. Vaughan and R. A. Collins",
  title =        "On computer architecture, software portability \& microprogramming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "4",
  pages =        "14--15",
  month =        oct,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brakefield:1972:OFP,
  author =       "James C. Brakefield",
  title =        "An optimal floating point format",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "4",
  pages =        "16--17",
  month =        oct,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brewer:1972:RDD,
  author =       "J. E.
Brewer",
  title =        "Recent doctoral dissertations of interest to {SIGARCH}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "4",
  pages =        "18--20",
  month =        oct,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bettcher:1973:TSR,
  author =       "C. W. Bettcher",
  title =        "Thread standardization and relative cost",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "1",
  pages =        "9--9",
  month =        jan,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "This is a reprint of an article published in the {\em Journal of the Society of Automotive Engineers}, Volume XVIII, Number 2, p. 131, February 1926, about the cost of the lack of standardization of screw threads. {\em Computer Architecture News\/} Editor-in-Chief Caxton C. Foster has added a hand-written note ``of course, there is no message here for {\em us}.''",
}

@Article{Sites:1973:FPS,
  author =       "Richard L. Sites",
  title =        "Floating point significance interrupt proposal",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "1",
  pages =        "10--12",
  month =        jan,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The purpose of this proposal is to aid numerical analysts in observing the significance of results in floating-point calculations.
This proposal is not a cure-all, but it does attempt to a first, high-payoff step in understanding and analyzing floating-point algorithms. This proposal is specifically for IBM 360/370 architecture, but the ideas are applicable to all machines.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "The author observes that register clearing by subtraction is common, and is one of the reasons that ``all IBM language processors execute with significance masked off.'' He proposes suppressing the significance interrupt in subtractions when both operands are equal.",
}

@Article{Foster:1973:CA,
  author =       "Caxton Foster",
  title =        "Computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "1",
  pages =        "13--18",
  month =        jan,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% TO DO: [04-Sep-2014] Volume 2 number 2: no data yet in ACM Portal database

@Article{Adler:1973:MCC,
  author =       "Louis S. Adler",
  title =        "A mini-computer configuration for {CAI}: a systems engineering view",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "3",
  pages =        "10--19",
  month =        oct,
  year =         "1973",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1216456.1216457",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:31:17 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Computer assisted instruction (CAI) has not impacted the educational world with the degree of success which early proponents predicted.
Although CAI has proven to be a more efficient learning tool than common traditional methods in specific instances, the overall success of such systems has been sporadic. There is no question that a well-designed and correctly implemented CAI system can be highly effective; however, several important factors must be overcome to guarantee a reasonable amount of success. These are:\par * Overcoming the present high cost of hardware while still providing a reliable system having acceptable display capability.\par * Developing a software real-time operating system which guarantees fast response times.\par * Authoring high quality courseware.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gentleman:1973:TC,
  author =       "W. M. Gentleman and B. A. Wichmann",
  title =        "Timing on computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "3",
  pages =        "20--23",
  month =        oct,
  year =         "1973",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1216456.1216458",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:31:17 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  URL =          "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Misc/monitor.bib",
  abstract =     "Most computers today provide some form of clock which can be read by software. The purpose of this note is to illustrate why in many existing systems, the facilities offered are inadequate for ordinary programmers.
Proposals are made for changes in both hardware and software to remedy these deficiencies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schank:1973:AAS,
  author =       "Karl Schank",
  title =        "Architectural assistance to software debugging aids",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "3",
  pages =        "37--38",
  month =        oct,
  year =         "1973",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1216456.1216459",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:31:17 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "It has been observed [1] that 45 to 50\% of programming effort is spent in debugging, checkout and testing, yet the architecture of most modern computer systems does little if anything to facilitate ease of debugging. In most batch systems the programmer is sufficiently removed from the execution of his program as to be severely handicapped in diagnosing errors. There is only so much information that can be easily obtained from a voluminous core dump, for instance. Even programmers on large timesharing systems have available at most an interactive software debugging package which operates through a combination of insertions and replacements of object code and interpretation (rather than execution) of machine code. This can get to be quite inefficient when carried to the extreme and often is useful only if the program has been processed by a special compiler.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhandarkar:1973:MCM,
  author =       "Dileep P. Bhandarkar and Samuel H.
Fuller",
  title =        "{Markov} chain models for analyzing memory interference in multiprocessor computer systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "1--6",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anderson:1973:IDP,
  author =       "George A. Anderson",
  title =        "Interconnecting a distributed processor system for avionics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "11--16",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goke:1973:BNP,
  author =       "L. Rodney Goke and G. J. Lipovski",
  title =        "{Banyan} networks for partitioning multiprocessor systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "21--28",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jordan:1973:SDS,
  author =       "Harry F. Jordan and Burton J.
Smith",
  title =        "Structure of digital system description languages",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "31--34",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1973:VDS,
  author =       "John A. N. Lee",
  title =        "{VDL}---a definition system for all levels",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "41--48",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Radoy:1973:MPP,
  author =       "Charles H. Radoy and George P. {Copeland, Jr.} and G. J. Lipovski",
  title =        "A methodology for parallel processing design tradeoffs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "51--56",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Reddaway:1973:DDA,
  author =       "S. F.
Reddaway",
  title =        "{DAP}---a distributed array processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "61--65",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kogge:1973:MRP,
  author =       "Peter M. Kogge",
  title =        "Maximal rate pipelined solutions to recurrence problems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "71--76",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agerwala:1973:CCL,
  author =       "Tilak Agerwala and Mike Flynn",
  title =        "Comments on capabilities, limitations and ``correctness'' of {Petri} nets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "81--86",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Omohundro:1973:FFC,
  author =       "Wayne E. Omohundro and James H.
Tracey",
  title =        "{Flowware}---a flow charting procedure to describe digital networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "91--97",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Barbacci:1973:AED,
  author =       "Mario R. Barbacci and Daniel P. Siewiorek",
  title =        "Automated exploration of the design space for register transfer {(RT)} systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "101--106",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Laliotis:1973:IAS,
  author =       "T. A. Laliotis",
  title =        "Implementation aspects of the symbol hardware compiler",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "111--115",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Copeland:1973:ACC,
  author =       "George P. {Copeland, Jr.} and G. J. Lipovski and Stanley Y. W.
Su",
  title =        "The architecture of {CASSM}: a cellular system for non-numeric processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "121--128",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hemphill:1973:DDG,
  author =       "John M. Hemphill and S. A. Szygenda",
  title =        "Deriving design guidelines for diagnosable computer systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "131--135",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parhami:1973:DFT,
  author =       "Behrooz Parhami and Algirdas Avizienis",
  title =        "Design of fault-tolerant associative processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "141--145",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fischler:1973:FTM,
  author =       "M. A. Fischler and O.
Firschein",
  title =        "A fault tolerant multiprocessor architecture for real-time control applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "151--157",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1973:VFS,
  author =       "G. J. Lipovski",
  title =        "A varistructured fail-soft cellular computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "161--165",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vaucher:1973:HLC,
  author =       "Jean Vaucher and Christian Rey",
  title =        "A hardware laboratory for computer architecture research",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "171--175",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Knoke:1973:SEC,
  author =       "P. J.
Knoke",
  title =        "Simulation exercises for computer architecture education",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "181--185",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sloan:1973:CAC,
  author =       "M. E. Sloan",
  title =        "Computer architecture courses in electrical engineering departments",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "191--195",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hartenstein:1973:IHC,
  author =       "R.
Hartenstein",
  title =        "Increasing hardware complexity---a challenge to computer architecture education",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "201--206",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rossmann:1973:RWC,
  author =       "George Rossmann",
  title =        "Review of the {{\em Workshop on Computer Architecture Education}}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "211--214",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cooper:1973:MMB,
  author =       "Richard G. Cooper",
  title =        "{Micromodules}: Microprogrammable building blocks for hardware development",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "221--226",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fuller:1973:CMA,
  author =       "S. H. Fuller and D. P. Siewiorek and R. J.
Swan",
  title =        "Computer Modules: an architecture for large digital modules",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "231--237",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zaks:1973:MAF,
  author =       "Rodnay Zaks",
  title =        "A microprogrammed architecture for front end processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "241--246",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vranesic:1973:DFV,
  author =       "Z. G. Vranesic and V. C. Hamacher and Y. Y. Leung",
  title =        "Design of a fully variable-length structured minicomputer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "251--255",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Marvel:1973:HHA,
  author =       "Orin E.
Marvel", title = "{HAPPE} {Honeywell Associative Parallel Processing Ensemble}", journal = j-COMP-ARCH-NEWS, volume = "2", number = "4", pages = "261--267", month = dec, year = "1973", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schaffner:1973:CAP, author = "Mario R. Schaffner", title = "A computer architecture and its programming language", journal = j-COMP-ARCH-NEWS, volume = "2", number = "4", pages = "271--277", month = dec, year = "1973", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shore:1974:CCa, author = "John Shore", title = "Conjecture corner", journal = j-COMP-ARCH-NEWS, volume = "3", number = "1", pages = "3--6", month = mar, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McKeeman:1974:CDE, author = "W. M.
McKeeman", title = "Computer design evaluation using programming language primitives", journal = j-COMP-ARCH-NEWS, volume = "3", number = "1", pages = "7--18", month = mar, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hartenstein:1974:LMI, author = "Reiner W. Hartenstein", title = "Letter to membership from incoming chairman {(CAN, Oct. 73)}", journal = j-COMP-ARCH-NEWS, volume = "3", number = "1", pages = "19--22", month = mar, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stryker:1974:SSA, author = "David Stryker and David Weiss", title = "Secure system architecture", journal = j-COMP-ARCH-NEWS, volume = "3", number = "2", pages = "37--38", month = jun, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:41 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Su:1974:BRL, author = "Stephen Y. H. Su", title = "Book review of {{\em Logic and Logic Design\/}} by {B. Girling and H. G. Morning. 
International Textbook Company Limited 1973}", journal = j-COMP-ARCH-NEWS, volume = "3", number = "3", pages = "2--3", month = sep, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:02 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shore:1974:CCb, author = "John Shore", title = "Conjecture corner", journal = j-COMP-ARCH-NEWS, volume = "3", number = "3", pages = "4--9", month = sep, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:02 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nisnevich:1974:DPC, author = "L. Nisnevich and E. Strasbourger", title = "Decentralized priority control in data communication", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "1--6", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reames:1974:LNS, author = "Cecil C. Reames and Ming T. 
Liu", title = "A loop network for simultaneous transmission of variable-length messages", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "7--12", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Callan:1974:APS, author = "James F. Callan", title = "The architecture of the {Picture System}", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "13--16", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "Evans \& Sutherland Picture System", } @Article{Staudhammer:1974:FDO, author = "John Staudhammer and Jeffrey F. Eastman and James N. England", title = "A fast display-oriented processor", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "17--22", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eastman:1974:CDC, author = "Jeffrey F. 
Eastman and John Staudhammer", title = "Computer display of colored three-dimensional objects", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "23--27", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kerr:1974:MPI, author = "Henry D. Kerr", title = "A microprogrammed processor for interactive computer graphics", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "28--33", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Armstrong:1974:FMT, author = "C. V. W. Armstrong", title = "Functional memory techniques applied to the microprogrammed control of an associative processor", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "34--40", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wade:1974:IDM, author = "James F. Wade and Paul D. 
Stigall", title = "Instruction design to minimize program size", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "41--44", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bondi:1974:HHM, author = "James O. Bondi and Paul D. Stigall", title = "{HMO}, a hardware microcode optimizer", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "45--51", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Peskin:1974:CAD, author = "A. M. Peskin", title = "The computer aided design of processor architectures", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "51--55", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huen:1974:IPR, author = "W. H. Huen and D. P. 
Siewiorek", title = "Intermodule protocol for register transfer level modules: representation and analytic tools", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "56--62", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Isaacson:1974:PSP, author = "Portia Isaacson", title = "Picture systems, {PS}, and the design of a channel-to-channel computer interface", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "63--70", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lofgren:1974:RCT, author = "Lennart L{\"o}fgren", title = "Reference concepts in a tree structured address space", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "71--79", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anderson:1974:VMM, author = "Judith A. Anderson and G. J. 
Lipovski", title = "A virtual memory for microprocessors", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "80--84", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brundage:1974:PED, author = "R. E. Brundage and A. P. Batson", title = "The performance enhancement of descriptor-based virtual memory systems through the use of associative registers", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "85--90", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Marvel:1974:SSP, author = "Orin E. Marvel", title = "{SPEAC}: special purpose electronic area correlator", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "91--94", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Satterfield:1974:AAS, author = "James M. 
Satterfield", title = "Architectural advances of the space shuttle orbiter avionics computer system", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "95--98", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kodres:1974:DSA, author = "Uno R. Kodres and William L. McCracken", title = "Design study of an avionics navigation microcomputer", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "99--105", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kane:1974:ISI, author = "Gerald R. Kane", title = "An iteratively structured information processor", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "106--112", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Richards:1974:HSI, author = "H. {Richards, Jr.} and A. E. 
Oldehoeft", title = "Hardware-software interactions in {SYMBOL-2R}'s operating system", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "113--118", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sylvain:1974:DEA, author = "Pierre Sylvain and Maniel Vineberg", title = "The design and evaluation of the array machine: a high-level language processor", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "119--125", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dennis:1974:PAB, author = "Jack B. Dennis and David P. Misunas", title = "A preliminary architecture for a basic data-flow processor", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "126--132", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Berkling:1974:RLR, author = "K. J. 
Berkling", title = "Reduction languages for reduction machines", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "133--140", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{King:1974:ODS, author = "Willis K. King and Fulvio Carbonaro", title = "Output devices sharing by minicomputers", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "141--145", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rannem:1974:RSC, author = "S. Rannem and V. C. Hamacher and S. G. Zaky and P. Connolly", title = "On relating small computer performance to design parameters", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "146--151", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lawson:1974:ASH, author = "Harold W. 
{Lawson, Jr.} and Bengt Magnhagen", title = "Advantages of structured hardware", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "152--158", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kornerup:1974:CMS, author = "Peter Kornerup", title = "Concepts of the {MATHILDA} system", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "159--164", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Foster:1974:S, author = "Caxton C. Foster", title = "{SOCRATES}", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "165--169", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wann:1974:CCS, author = "Donald F. Wann and Robert A. 
Ellis", title = "Conjoined computer systems: an architecture for laboratory data processing and instrument control", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "170--175", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jensen:1974:DFC, author = "E. Douglas Jensen", title = "A distributed function computer for real-time control", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "176--182", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Radoy:1974:SMI, author = "C. H. Radoy and G. J. Lipovski", title = "Switched multiple instruction, multiple data stream processing", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "183--187", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lechner:1974:SED, author = "Robert J. 
Lechner", title = "Sequentially encoded data structures that support bidirectional scanning", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "188--194", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Freeman:1974:ICE, author = "Martin Freeman", title = "An instruction class for an extensible interpreter", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "195--200", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Giloi:1974:SCC, author = "W. K. Giloi and H. Berg", title = "{STARLET}: a computer concept based on ordered sets as primitive data types", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "201--206", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cornell:1974:CGP, author = "R. G. Cornell and H. C. 
Torng", title = "A cellular general purpose computer", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "207--213", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goldstein:1974:MOR, author = "Barry C. Goldstein and Thomas W. Scrutchin", title = "A machine-oriented resource management architecture", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "214--219", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sloan:1974:DOC, author = "M. E. Sloan", title = "A design-oriented computer engineering program", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "220--224", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Baron:1974:ELC, author = "Janis Beitch Baron and D. E. 
Atkins", title = "An educational laboratory in contemporary digital design", journal = j-COMP-ARCH-NEWS, volume = "3", number = "4", pages = "225--231", month = dec, year = "1974", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:04 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1975:ACFa, author = "W. R. Smith", title = "{AADC} computer family architecture program", journal = j-COMP-ARCH-NEWS, volume = "4", number = "1", pages = "4--8", month = mar, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lunde:1975:MDW, author = "{\AA}mund Lunde", title = "More data on the {O/W} ratios: a note on a paper by {Flynn}", journal = j-COMP-ARCH-NEWS, volume = "4", number = "1", pages = "9--13", month = mar, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lipovski:1975:NNA, author = "G. Jack Lipovski and Stanley Y. W. 
Su", title = "On non-numeric architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "1", pages = "14--29", month = mar, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Boulaye:1975:SDS, author = "Guy G. Boulaye", title = "Structured design for structured computer architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "2", pages = "8--17", month = jun, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Parnas:1975:ECA, author = "D. L. Parnas", title = "Evaluation criteria for abstract machines with unknown applications", journal = j-COMP-ARCH-NEWS, volume = "4", number = "3", pages = "2--9", month = sep, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:02 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: The AADC computer family architecture project", } @Article{Smith:1975:ACFb, author = "William R.
Smith", title = "{AADC} computer family architecture questions and answers", journal = j-COMP-ARCH-NEWS, volume = "4", number = "3", pages = "15--21", month = sep, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:02 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: The AADC computer family architecture project", } @Article{Su:1975:ICC, author = "Stephen Y. H. Su", title = "An introduction to {CHDL} (computer hardware description languages)", journal = j-COMP-ARCH-NEWS, volume = "4", number = "3", pages = "22--23", month = sep, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:02 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Doran:1975:ICL, author = "R. W. Doran", title = "The {International Computers Ltd. ICL2900} computer architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "3", pages = "24--47", month = sep, year = "1975", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:02 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bell:1976:CSW, author = "Gordon Bell and William D. 
Strecker", title = "Computer structures: {What} have we learned from the {PDP-11}?", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "1--14", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kerner:1976:PLL, author = "Helmut Kerner and Werner Beyerle", title = "A {PMS} level language for performance evaluation modelling {(V-PMS)}", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "15--19", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Moalla:1976:DTM, author = "M. Moalla and G. Saucier and J. Sifakis and M. 
Zachariades", title = "A design tool for the multilevel description and simulation of systems of interconnected modules", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "20--27", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Allen:1976:CCS, author = "Jonathan Allen", title = "A course in computer structures", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "28--32", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rossmann:1976:ICS, author = "George E. Rossmann", title = "The {IEEE Computer Society} task force on computer architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "33--33", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Widdoes:1976:MMM, author = "Lawrence C. 
{Widdoes, Jr.}", title = "The {Minerva} multi-microprocessor", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "34--39", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Arnold:1976:HRM, author = "R. G. Arnold and E. W. Page", title = "A hierarchical, restructurable multi-microprocessor architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "40--45", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McGill:1976:MAN, author = "Robert McGill and John Steinhoff", title = "A multimicroprocessor approach to numerical analysis: {An} application to gaming problems", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "46--51", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jensen:1976:MIS, author = "John E. 
Jensen and Jean-Loup Baer", title = "A model of interference in a shared resource multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "52--57", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Leung:1976:CSF, author = "Clement K. C. Leung and David P. Misunas and Andrij Neczwid and Jack B. Dennis", title = "A computer simulation facility for packet communication architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "58--63", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rege:1976:CPS, author = "S. L. Rege", title = "Cost, performance and size tradeoffs for different levels in a memory hierarchy", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "64--67", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dworak:1976:IIR, author = "Paul E. Dworak and Alice C. 
Parker", title = "An input interface for a real-time digital sound generation system", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "68--73", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mulder:1976:MOD, author = "Michael C. Mulder and Patrick P. Fasang", title = "A microprocessor oriented data acquisition and control system for power system control", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "74--78", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gladney:1976:MRT, author = "H. M. Gladney and G. Hochweller", title = "Multiprogramming for real-time applications", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "79--85", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kehl:1976:BAH, author = "Theodore H. 
Kehl", title = "{Basil} architecture --- an {HLL} minicomputer", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "86--92", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lawson:1976:FDC, author = "Harold W. {Lawson, Jr.}", title = "Function distribution in computer system architectures", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "93--97", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vissers:1976:IDA, author = "Chris A. Vissers", title = "Interface, a dispersed architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "98--104", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thomasian:1976:DSS, author = "A. Thomasian and A. 
Avizienis", title = "A design study of a shared resource computing system", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "105--112", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ford:1976:HSI, author = "W. S. Ford and V. C. Hamacher", title = "Hardware support for inter-process communication and processor sharing", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "113--118", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Trambacz:1976:TDP, author = "Ulrich Trambacz and Georg Hyla", title = "A taxonomy of display processors", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "119--120", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kluge:1976:TBT, author = "W. E. 
Kluge", title = "Traversing binary tree structures with shift register memories (recent results)", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "121.1--121.1", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fernandez:1976:ASS, author = "Eduardo B. Fernandez and Rita C. Summers and Charles D. Coleman", title = "Architectural support for system protection (recent results)", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "121.2--121.2", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gault:1976:DUP, author = "James W. Gault and Alice C. Parker", title = "The design of a user-programmable digital interface (recent results)", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "121.3--121.3", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fournier:1976:SDG, author = "Serge Fournier and Ming T. 
Liu", title = "System design of a grammar-programmable high-level language machine", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "122.4--122.4", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kuznia:1976:SSM, author = "Ch. Kuznia and R. Kober and H. Kopp", title = "{SMS 101} --- a structured multi microprocessor system with deadlock-free operation scheme", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "122.5--122.5", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Liu:1976:SSD, author = "Philip S. Liu and Frederic J. Mowle", title = "Selection schemes for dynamically microcoding {Fortran} programs", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "122.6--122.6", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fuller:1976:DMM, author = "S. H. Fuller and D. P. Siewiorek and R. J. 
Swan", title = "The design of a multi-micro-computer system", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "123--123", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reames:1976:DSD, author = "Cecil C. Reames and Ming T. Liu", title = "Design and simulation of the distributed loop computer network {(DLCN)}", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "124--129", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Franchi:1976:DFC, author = "Paolo Franchi", title = "Distribution of functions and control in {RPCNET}", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "130--135", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wittie:1976:EMR, author = "Larry D. 
Wittie", title = "Efficient message routing in {Mega-Micro-Computer} networks", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "136--140", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Welch:1976:IDO, author = "Terry A. Welch", title = "An investigation of descriptor oriented architecture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "141--146", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Feustel:1976:TAS, author = "E. A. Feustel", title = "Tagged architecture and the semantics of programming languages: {Extensible} types", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "147--150", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Batson:1976:DDA, author = "A. P. Batson and R. E. Brundage and J. P. 
Kearns", title = "Design data for {Algol-60} machines", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "151--154", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Strecker:1976:CMP, author = "William D. Strecker", title = "Cache memories for {PDP-11} family computers", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "155--158", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patel:1976:ITP, author = "Janak H. Patel and Edward S. Davidson", title = "Improving the throughput of a pipeline by insertion of delays", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "159--164", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Abd-Alla:1976:LAT, author = "A. M. Abd-Alla and Laird H. 
Moffett", title = "On-line architecture tuning using microcapture", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "165--171", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Healy:1976:COC, author = "Leonard D. Healy", title = "A character-oriented context-addressed segment-sequential storage", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "172--177", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bush:1976:SIS, author = "J. A. Bush and G. J. Lipovski and S. Y. W. Su and J. K. Watson and S. J. Ackerman", title = "Some implementations of segment sequential functions", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "178--185", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DeMartinis:1976:SMS, author = "Manlio DeMartinis and G. Jack Lipovski and Stanley Y. W. Su and J. K.
Watson", title = "A {Self Managing Secondary Memory} system", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "186--194", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fuller:1976:PPC, author = "Samuel H. Fuller", title = "Price\slash performance comparison of {C.mmp} and the {PDP-10}", journal = j-COMP-ARCH-NEWS, volume = "4", number = "4", pages = "195--202", month = jan, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorelli:1976:RAC, author = "Lars-Erik Thorelli", title = "Representation of arrays in computers", journal = j-COMP-ARCH-NEWS, volume = "5", number = "1", pages = "6--9", month = apr, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Berndt:1976:ECA, author = "Helmut Berndt", title = "Evolutionary computer architecture: the {Unidata 7.000} series", journal = j-COMP-ARCH-NEWS, volume = "5", number = "1", pages = "10--16", month = apr, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", 
acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dennis:1976:CAC, author = "Jack B. Dennis", title = "Computer architecture and the cost of software", journal = j-COMP-ARCH-NEWS, volume = "5", number = "1", pages = "17--21", month = apr, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lindamood:1976:NCA, author = "George Lindamood", title = "On navel contemplation and the art of computer maintenance", journal = j-COMP-ARCH-NEWS, volume = "5", number = "1", pages = "22--23", month = apr, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fuller:1976:IMS, author = "S. H. Fuller and G. A. Mathew", title = "Implementing microprogram storage with {PLA}'s", journal = j-COMP-ARCH-NEWS, volume = "5", number = "2", pages = "6--11", month = jun, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hicks:1976:GQS, author = "D. R. 
Hicks", title = "A generalized queue scheme for process synchronization and communication", journal = j-COMP-ARCH-NEWS, volume = "5", number = "2", pages = "12--14", month = jun, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Langdon:1976:BRR, author = "Glen G. Langdon", title = "Book reviews: Review of {{\em Introduction to Computer Architecture\/}} by {Harold S. Stone}", journal = j-COMP-ARCH-NEWS, volume = "5", number = "2", pages = "17--19", month = jun, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } %%% TO DO: [04-Sep-2014] Volume 5 number 3: no data yet in ACM Portal database @Article{Thurber:1976:ANR, author = "Kenneth J. Thurber", title = "{ARPS}: a new real-time computer", journal = j-COMP-ARCH-NEWS, volume = "5", number = "4", pages = "6--16", month = oct, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:09 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Salisbury:1976:MMC, author = "Alan B. 
Salisbury", title = "{MCF}: a military computer family for computer-based systems", journal = j-COMP-ARCH-NEWS, volume = "5", number = "4", pages = "17--20", month = oct, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:09 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ris:1976:UDF, author = "Frederic N. Ris", title = "A unified decimal floating-point architecture for the support of high-level languages", journal = j-COMP-ARCH-NEWS, volume = "5", number = "4", pages = "21--31", month = oct, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:09 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lipovski:1976:QS, author = "G. Jack Lipovski", title = "A question of style", journal = j-COMP-ARCH-NEWS, volume = "5", number = "4", pages = "32--38", month = oct, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:09 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chroust:1976:DIV, author = "G. 
Chroust", title = "Data interfaces versus control interfaces: a half-baked conjecture", journal = j-COMP-ARCH-NEWS, volume = "5", number = "4", pages = "39--40", month = oct, year = "1976", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:09 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } %%% TO DO: [04-Sep-2014] Volume 5 number 5: no data yet in ACM Portal database @Article{Langdon:1977:CFM, author = "Glen G. Langdon", title = "Considerations on the ``figure of merit'' technique for storage hierarchy design", journal = j-COMP-ARCH-NEWS, volume = "5", number = "6", pages = "25--28", month = feb, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:28 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Miller:1977:BRRb, author = "Edward F. Miller", title = "Book Reviews: Review of {{\em High-Level Language Computer Architecture\/}} by {Yaohan Chu. 
Academic Press, New York, 1975}", journal = j-COMP-ARCH-NEWS, volume = "5", number = "6", pages = "29--29", month = feb, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:28 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chu:1977:AHD, author = "Yaohan Chu", title = "Architecture of a hardware data interpreter", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "1--9", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dasgupta:1977:DSL, author = "Subrata Dasgupta", title = "The design of some language constructs for horizontal microprogramming", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "10--16", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jensen:1977:HMM, author = "E. Douglas Jensen and Richard Y. 
Kain", title = "The {Honeywell Modular Microprogram Machine}: {M3}", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "17--28", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ramseyer:1977:MMI, author = "Richard R. Ramseyer and Andries van Dam", title = "A multi-microprocessor implementation of a general purpose pipelined {CPU}", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "29--34", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ravi:1977:HMS, author = "C. V. Ravi and Torben Moller", title = "A hierarchical microcomputer system for hardware and software development", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "35--40", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Harris:1977:HMO, author = "J. Archer Harris and David R. 
Smith", title = "Hierarchical multiprocessor organizations", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "41--48", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hurakami:1977:PPS, author = "K. Hurakami and S. Nishikawa and M. Sato", title = "{Poly-Processor System} analysis and design", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "49--56", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mazare:1977:FEH, author = "Guy Mazare", title = "A few examples of how to use a symmetrical multi-micro-processor", journal = j-COMP-ARCH-NEWS, volume = "5", number = "7", pages = "57--62", month = mar, year = "1977", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kogge:1977:MPP, author = "Peter M.
Kogge",
  title =        "The microprogramming of pipelined processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "63--69",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Siegel:1977:UVT,
  author =       "Howard Jay Siegel",
  title =        "The universality of various types of {SIMD} machine
                 interconnection networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "70--79",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rau:1977:EIF,
  author =       "Ramakrishna B. Rau and George E. Rossmann",
  title =        "The effect of instruction fetch strategies upon the
                 performance of pipelined instruction units",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "80--89",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ahuja:1977:MMS,
  author =       "S. R. Ahuja and J. R.
Jump",
  title =        "A modular memory scheme for array processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "90--94",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Haynes:1977:AAC,
  author =       "Leonard S. Haynes",
  title =        "The architecture of an {ALGOL 60} computer implemented
                 with distributed processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "95--104",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sullivan:1977:LSHa,
  author =       "Herbert Sullivan and Theodore R. Bashkow",
  title =        "A large scale, homogeneous, fully distributed parallel
                 machine, {I}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "105--117",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sullivan:1977:LSHb,
  author =       "Herbert Sullivan and Theodore R.
Bashkow and David Klappholz",
  title =        "A large scale, homogeneous, fully distributed parallel
                 machine, {II}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "118--124",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1977:VMM,
  author =       "G. Jack Lipovski",
  title =        "On virtual memories and micronetworks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "125--134",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Strauss:1977:CNT,
  author =       "Jon C. Strauss and Kenneth J. Thurber",
  title =        "Considerations for new tactical computer systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "135--140",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thurber:1977:ATC,
  author =       "Kenneth J. Thurber and Peter C. Patton and Robert C.
                 Deward and Jon C. Strauss and Thomas W.
Petschauer",
  title =        "An advanced tactical computer concept",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "141--146",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nutt:1977:MIP,
  author =       "Gary J. Nutt",
  title =        "Microprocessor implementation of a parallel processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "147--152",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dworak:1977:DIR,
  author =       "Paul Dworak and Alice C. Parker and Richard Blum",
  title =        "The design and implementation of a real-time sound
                 generation system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "153--158",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parker:1977:HST,
  author =       "A. C. Parker and A. W.
Nagle",
  title =        "Hardware\slash software tradeoffs in a variable word
                 width, variable queue length buffer memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "159--164",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Peuto:1977:ITM,
  author =       "Bernard L. Peuto and Leonard J. Shustek",
  title =        "An instruction timing model of {CPU} performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "165--178",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hoogendoorn:1977:RMI,
  author =       "Cornelis H. Hoogendoorn",
  title =        "Reduction of memory interference in multiprocessor
                 systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "179--183",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hammerstrom:1977:ICC,
  author =       "D. W. Hammerstrom and E. S.
Davidson",
  title =        "Information content of {CPU} memory referencing
                 behavior",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "184--192",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:1977:MCP,
  author =       "Ming T. Liu and Cecil C. Reames",
  title =        "Message communication protocol and operating system
                 design for the {Distributed Loop Computer Network
                 (DLCN)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "193--200",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Poujoulat:1977:ACB,
  author =       "G. H. Poujoulat",
  title =        "Architecture of the {CORAIL} building block system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "201--204",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tredennick:1977:HSB,
  author =       "H. L. Tredennick and T. A.
Welch",
  title =        "High-speed buffering for variable length operands",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "205--210",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Steel:1977:AGP,
  author =       "Rod Steel",
  title =        "Another general purpose computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "8",
  pages =        "5--11",
  month =        apr,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lindamood:1977:WN,
  author =       "George E.
Lindamood",
  title =        "What's in a name?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "8",
  pages =        "12--14",
  month =        apr,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schneiker:1977:MF,
  author =       "Conrad Schneiker",
  title =        "The microprocessors of the future",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "8",
  pages =        "15--16",
  month =        apr,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Miller:1977:BRR,
  author =       "Edward F. {Miller, Jr.}",
  title =        "Book review: Review of {{\em Large-Scale Computer
                 Architecture: Parallel and Associative Processors\/}}
                 by {Kenneth J. Thurber, Hayden Book Company, Rochelle
                 Park, New Jersey 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "8",
  pages =        "17--17",
  month =        apr,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Conner:1977:IOC,
  author =       "William M. Conner and Edward R.
Dirling",
  title =        "Input\slash Output considerations in look-ahead
                 processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "1",
  pages =        "7--12",
  month =        jun,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rosin:1977:SM,
  author =       "Robert F. Rosin",
  title =        "The significance of microprogramming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "1",
  pages =        "14--19",
  month =        jun,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gonzalez:1977:BRR,
  author =       "Mario J.
Gonzalez",
  title =        "Book review: Review of {{\em Microprogramming
                 Primer\/}} by {Harry Katzan, Jr., McGraw-Hill 1977}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "1",
  pages =        "29--30",
  month =        jun,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vineberg:1977:ICS,
  author =       "Maniel Vineberg",
  title =        "Implementation of character string pattern matching on
                 a multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "1--7",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bird:1977:APP,
  author =       "R. M. Bird and J. C. Tu and R. M. Worthy",
  title =        "Associative\slash parallel processors for searching
                 very large textual data bases",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "8--9",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1977:IFT,
  author =       "G. Jack
Lipovski",
  title =        "On imaginary fields, token transfers and floating
                 codes in intelligent secondary memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "17--22",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zaky:1977:MNN,
  author =       "S. G. Zaky",
  title =        "Microprocessors for non-numeric processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "23--30",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hsiao:1977:ADC,
  author =       "David K. Hsiao and Krishnamurthi Kannan",
  title =        "The architecture of a database computer --- a summary",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "31--33",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rosenthal:1977:DMM,
  author =       "Robert S.
Rosenthal",
  title =        "The data management machine, a classification",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "35--39",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McDonell:1977:TNS,
  author =       "Ken J. McDonell",
  title =        "Trends in non-software support for input-output
                 functions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "40--47",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cerretti:1977:UIP,
  author =       "R. Cerretti and D. Jasilli and D. R. Matteucci",
  title =        "{Ulisse}: {An Italian} project for a multifunctional
                 terminal system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "48--50",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bray:1977:DMR,
  author =       "Olin H.
Bray",
  title =        "Data management requirements: {The} similarity of
                 memory management, database systems, and message
                 processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "68--76",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Landson:1977:CSA,
  author =       "Barry M. Landson and Robert G. Sargent",
  title =        "A comparison of sequential and associate computing of
                 priority queues",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "77--78",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Myers:1977:CAS,
  author =       "Glenford J. Myers",
  title =        "The case against stack-oriented instruction sets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "3",
  pages =        "7--10",
  month =        aug,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tanenbaum:1977:AMA,
  author =       "Andrew S.
Tanenbaum",
  title =        "Ambiguous machine architecture and program efficiency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "3",
  pages =        "11--13",
  month =        aug,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hicks:1977:MCA,
  author =       "D. R. Hicks",
  title =        "Microprogramming with a content-addressable
                 read-only-memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "3",
  pages =        "14--15",
  month =        aug,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hicks:1977:MPS,
  author =       "D. R. Hicks",
  title =        "Multitasking as a program structuring primitive",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "3",
  pages =        "16--18",
  month =        aug,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chroust:1977:BRR,
  author =       "G. Chroust",
  title =        "Book reviews: Review of {{\em Digital System
                 Implementation\/}} by {Gerrit A.
Blaauw, Prentice Hall, Series in Automatic Computation 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "4",
  pages =        "27--28",
  month =        oct,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:09 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hagan:1977:VMS,
  author =       "R. A. Hagan and C. S. Wallace",
  title =        "A virtual memory system for the {Hewlett Packard
                 2100A}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "5",
  pages =        "5--13",
  month =        dec,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Baskett:1977:MMF,
  author =       "Forest Baskett",
  title =        "More on microprocessors of the future",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "5",
  pages =        "14--17",
  month =        dec,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chu:1977:DEC,
  author =       "Yaohan Chu",
  title =        "Direct-execution computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "5",
  pages =        "18--23",
  month =        dec,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schulthess:1977:RCA,
  author =       "Peter U. Schulthess and Eduard P. Mumprecht",
  title =        "Reply to the case against stack-oriented instruction
                 sets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "5",
  pages =        "24--27",
  month =        dec,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mountain:1978:AMC,
  author =       "John B. Mountain and Philip H. Enslow",
  title =        "Application of the military computer family
                 architecture selection criteria to the {PR1ME P400}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "6",
  pages =        "3--17",
  month =        feb,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1978:JFM,
  author =       "G. Jack Lipovski",
  title =        "Just a few more words on microprocessors of the
                 future",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "6",
  pages =        "18--21",
  month =        feb,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Keedy:1978:USE,
  author =       "J. L.
Keedy",
  title =        "On the use of stacks in the evaluation of expressions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "6",
  pages =        "22--28",
  month =        feb,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tanenbaum:1978:RPA,
  author =       "Andrew S. Tanenbaum",
  title =        "Review of {{\em Processor Architecture\/}} by {S. H.
                 Lavington, NCC Publications, Manchester 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "6",
  pages =        "31--31",
  month =        feb,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Whiteside:1978:BRR,
  author =       "A. E. Whiteside",
  title =        "Book reviews: Review of {{\em The Architecture of
                 Concurrent Programs\/}} by {Per Brinch Hansen,
                 Prentice-Hall 1977}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "6",
  pages =        "32--32",
  month =        feb,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:28 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhandarkar:1978:STT,
  author =       "Dileep P. Bhandarkar and J.
Egil Juliussen",
  title =        "Semiconductor technology: trends and implications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "4--14",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Payne:1978:CCD,
  author =       "A. J. Payne",
  title =        "A computer console design to help the operator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "15--22",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McGlynn:1978:RCA,
  author =       "Daniel R. McGlynn",
  title =        "Review of {{\em Content Addressable Parallel
                 Processors\/}} by {Caxton C. Foster, Van Nostrand
                 Reinhold Co. 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "23--23",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ramamoorthy:1978:RSC,
  author =       "C. V. Ramamoorthy",
  title =        "Review of {{\em Structured Computer Organization\/}}
                 by {Andrew S.
Tanenbaum, Prentice-Hall 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "23--23",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Buchholz:1978:RCS,
  author =       "W. Buchholz",
  title =        "Review of {{\em Computer System Architecture\/}} by
                 {M. Morris Mano, Prentice-Hall 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "24--24",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vranesic:1978:BRR,
  author =       "Z. G. Vranesic",
  title =        "Book reviews: Review of {{\em Content Addressable
                 Parallel Processors\/}} by {Caxton C. Foster, Van
                 Nostrand Reinhold Co. 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "24--24",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Korfhage:1978:DPU,
  author =       "R. R. Korfhage and W. H. E. Day and L. L. Beck and W. F.
Appelbe",
  title =        "Data physics: an unorthodox view of data and its
                 implications in data processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "1--7",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Copeland:1978:SSS,
  author =       "George P. Copeland",
  title =        "String storage and searching for data base
                 applications: implementation on the {INDY} backend
                 kernel",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "8--17",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Otis:1978:ERD,
  author =       "Allen J. Otis and George P. Copeland",
  title =        "Editing requirements for data base applications and
                 their implementation on the {INDY} backend kernel",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "18--29",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1978:SPI,
  author =       "G.
Jack Lipovski",
  title           = "Semantic paging on intelligent discs",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "30--34",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Williams:1978:MSD,
  author          = "Rhon Williams",
  title           = "A multiprocessing system for the direct execution of {LISP}",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "35--41",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Bird:1978:TFI,
  author          = "R. M. Bird and J. B. Newsbaum and J. L. Trefftzs",
  title           = "Text file inversion: an evaluation",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "42--50",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Roberts:1978:SCA,
  author          = "David C.
Roberts",
  title           = "A specialized computer architecture for text retrieval",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "51--59",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Stucki:1978:CCA,
  author          = "M. J. Stucki and J. R. Cox and G. C. Roman and P. N. Turcu",
  title           = "Coordinating concurrent access in a distributed database architecture",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "60--64",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Gouda:1978:HCC,
  author          = "Mohamed G.
Gouda",
  title           = "A hierarchical controller for concurrent accessing of distributed databases",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "65--70",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Gavish:1978:EAD,
  author          = "Bezalel Gavish and Harvey Koch",
  title           = "An extensible architecture for data flow processing",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "71--76",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Harvill:1978:FPO,
  author          = "J. B. Harvill",
  title           = "Functional parallelism in an operand state saving computer",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "77--84",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Hutchison:1978:MM,
  author          = "J. S. Hutchison and W. G.
Roman",
  title           = "Madman machine",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "85--90",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Banerjee:1978:UDM,
  author          = "Jayanta Banerjee and David K. Hsiao",
  title           = "The use of a database machine for supporting relational databases",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "91--98",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Sadowski:1978:EPR,
  author          = "Paul J. Sadowski and S. A.
Schuster",
  title           = "Exploiting parallelism in a {Relational Associative Processor}",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "99--109",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:1978:BRD,
  author          = "Hsu Chang",
  title           = "Bubbles for relational database",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "110--116",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{ElMasri:1978:MIR,
  author          = "A. {El Masri} and J. Rohmer and D. Tusera",
  title           = "A machine for information retrieval",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "117--120",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Matteucci:1978:DSA,
  author          = "Dante R.
Matteucci",
  title           = "A distributed structure for the automization of the {Catalog of the National Cultural Heritage}: experiences and proposals",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "2",
  pages           = "121--133",
  month           = aug,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:41 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Thurber:1978:CCT,
  author          = "Kenneth J. Thurber",
  title           = "Computer communication techniques",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "3",
  pages           = "7--16",
  month           = oct,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:02 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Jennings:1978:VP,
  author          = "Hal W.
Jennings",
  title           = "A variation on the {PDP 11}",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "3",
  pages           = "17--26",
  month           = oct,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:02 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

%%% NB: The author's surname is the compound ``Brinch Hansen'', so it is
%%% braced to stop BibTeX from parsing ``Brinch'' as a given name.  The
%%% citation key is left unchanged so that existing citations still resolve.
@Article{Hansen:1978:MAC,
  author          = "Per {Brinch Hansen}",
  title           = "Multiprocessor architectures for concurrent programs",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "4",
  pages           = "4--23",
  month           = dec,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:05 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Keedy:1978:EEU,
  author          = "J. L.
Keedy",
  title           = "On the evaluation of expressions using accumulators, stacks and store-to-store instructions",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "4",
  pages           = "24--27",
  month           = dec,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:05 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Chattergy:1978:CL,
  author          = "Rahul Chattergy",
  title           = "In the current literature",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "4",
  pages           = "30--30",
  month           = dec,
  year            = "1978",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:05 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Cragon:1979:ECS,
  author          = "Harvey G. Cragon",
  title           = "An evaluation of code space requirements and performance of various architectures",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "5",
  pages           = "5--21",
  month           = feb,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:24 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Thurber:1979:BLC,
  author          = "Kenneth J. Thurber and Harvey A.
Freeman",
  title           = "A bibliography of local computer network architectures",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "5",
  pages           = "22--27",
  month           = feb,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:24 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Cox:1979:NCA,
  author          = "Lyle A. {Cox, Jr.}",
  title           = "The nature of ``computer architecture''",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "7",
  pages           = "8--12",
  month           = apr,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:29 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{vandeSnepscheut:1979:INP,
  author          = "Jan L. A. van de Snepscheut and Gert A. Slavenburg",
  title           = "Introducing the notion of processes to hardware",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "7",
  pages           = "13--23",
  month           = apr,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:29 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Atkins:1979:RAC,
  author          = "D. E. Atkins",
  title           = "Review of {{\em Advances in Computer Architecture\/}} by {Glenford J. Myers.
Wiley-Interscience Division of John Wiley and Sons 1978}",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "7",
  pages           = "25--26",
  month           = apr,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:29 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Bowyer:1979:BRS,
  author          = "Kevin W. Bowyer",
  title           = "Book review of {{\em The Structure of Computers and Computations: Volume One\/}} by {David J. Kuck. John Wiley \& Sons 1978}",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "7",
  pages           = "27--30",
  month           = apr,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:29 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Gibson:1979:TOR,
  author          = "Randall Gibson and Paul Anderson",
  title           = "Technical overview of the {Renaissance Octobus} system",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "8",
  pages           = "2--9",
  month           = jun,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:31 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Stevenson:1979:EEM,
  author          = "Johan W. Stevenson and Andrew S.
Tanenbaum",
  title           = "Efficient encoding of machine instructions",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "8",
  pages           = "10--17",
  month           = jun,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:31 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Keedy:1979:MUS,
  author          = "J. L. Keedy",
  title           = "More on the use of stacks in the evaluation of expressions",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "8",
  pages           = "18--22",
  month           = jun,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:31 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Quick:1979:IMP,
  author          = "G. E. Quick",
  title           = "Intelligent memory: ``a parallel processing concept''",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "8",
  pages           = "23--28",
  month           = jun,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:31 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Rivest:1979:BCA,
  author          = "Ronald L.
Rivest",
  title           = "The {BLIZZARD} computer architecture",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "9",
  pages           = "2--10",
  month           = aug,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:31 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Keedy:1979:TPR,
  author          = "J. L. Keedy",
  title           = "A technique for passing reference parameters in an information-hiding architecture",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "9",
  pages           = "11--15",
  month           = aug,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:41:31 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Kavipurapu:1979:QAU,
  author          = "Krishna M. Kavipurapu and Dennis J.
Frailey",
  title           = "Quantification of architectures using software science",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "10",
  pages           = "2--6",
  month           = oct,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:26 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Turton:1979:PHS,
  author          = "Trevor Turton",
  title           = "A proposed high-speed computer design",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "10",
  pages           = "7--21",
  month           = oct,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:26 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Staff:1979:CL,
  author          = "{Computer Architecture News} staff",
  title           = "In the current literature",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "7",
  number          = "10",
  pages           = "22--22",
  month           = oct,
  year            = "1979",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:26 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

%%% TO DO: [04-Sep-2014] Volume 8 number 1: no data yet in ACM Portal database

@Article{Richards:1980:CE,
  author          = "Dana Richards",
  title           = "On a {``Counter--Example''}",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "2",
  pages           = "2--3",
  month           = apr,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:39 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Denning:1980:WIC,
  author          = "Peter J. Denning",
  title           = "Why not innovations in computer architecture?",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "2",
  pages           = "4--7",
  month           = apr,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:39 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Gerrity:1980:HDU,
  author          = "G. W. Gerrity",
  title           = "Hardware detection of undefined references",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "2",
  pages           = "8--11",
  month           = apr,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:39 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Denning:1980:MCS,
  author          = "Peter J. Denning and T. Don Dennis",
  title           = "On minimizing contention at semaphores",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "2",
  pages           = "12--19",
  month           = apr,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 09:40:39 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Dennis:1980:BBD,
  author          = "Jack B. Dennis and G. Andrew Boughton and Clement K. C.
Leung",
  title           = "Building blocks for data flow prototypes",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "1--8",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Davidson:1980:MSM,
  author          = "Edward S. Davidson",
  title           = "A multiple stream microprocessor prototype system: {AMP-1}",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "9--16",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Andre:1980:KAO,
  author          = "F. Andre and J. P. Ban{\^a}tre and H. Leroy and G. Paget and F. Ployette and J. P. Routeau",
  title           = "{KENSUR}: An architecture oriented towards programming languages translation",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "17--22",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Kuhl:1980:DFT,
  author          = "J. G. Kuhl and S. M.
Reddy",
  title           = "Distributed fault-tolerance for large multiprocessor systems",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "23--30",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Malek:1980:CCA,
  author          = "Miroslaw Malek",
  title           = "A comparison connection assignment for diagnosis of multiprocessor systems",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "31--36",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Grosspietsch:1980:CTR,
  author          = "K. E. Grosspietsch and J. Kaiser and E.
Nett",
  title           = "A concept for test and reconfiguration of a fault-tolerant {VLSI} processor system",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "37--43",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Brassard:1980:PBC,
  author          = "Jean-Paul Brassard and Jan Gecsei",
  title           = "Path building in cellular partitioning networks",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "44--50",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{McMillen:1980:MMC,
  author          = "Robert J. McMillen and Howard Jay Siegel",
  title           = "{MIMD} machine communication using the augmented data manipulator network",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "51--60",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Shen:1980:FTC,
  author          = "John P. Shen and John P.
Hayes",
  title           = "Fault tolerance of a class of connecting networks",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "61--71",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Coffman:1980:CBS,
  author          = "E. G. {Coffman, Jr.} and Kimming So",
  title           = "On the comparison between single and multiple processor systems",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "72--79",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Hamacher:1980:PCF,
  author          = "V. Carl Hamacher and Gerald S. Shedler",
  title           = "Performance of a collision-free local bus network having asynchronous distributed control",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "80--87",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Zuberek:1980:TPN,
  author          = "W. M.
Zuberek",
  title           = "Timed {Petri} nets and preliminary performance evaluation",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "88--96",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Ditzel:1980:RHL,
  author          = "David R. Ditzel and David A. Patterson",
  title           = "Retrospective on high-level language computer architecture",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "97--104",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Sansonnet:1980:MLD,
  author          = "J. P. Sansonnet and M. Castan and C.
Percebois",
  title           = "{M3L}: a list-directed architecture",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "105--112",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Hibino:1980:PPG,
  author          = "Yasushi Hibino",
  title           = "A Practical Parallel Garbage Collection Algorithm and Its Implementation",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "113--120",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  URL             = "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Compiler/garbage.collection.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
  keywords        = "Hardware assisted GC",
}

@Article{Treleaven:1980:MPR,
  author          = "Philip C. Treleaven and Geoffrey F. Mole",
  title           = "A multi-processor reduction machine for user-defined reduction languages",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "121--130",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Tobias:1980:SUM,
  author          = "Jeffrey M.
Tobias",
  title           = "A single user multiprocessor incorporating processor manipulation facilities",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "131--138",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Halstead:1980:MSD,
  author          = "Robert H. {Halstead, Jr.} and Stephen A. Ward",
  title           = "The {MuNet}: a scalable decentralized architecture for parallel computation",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "139--145",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Lampson:1980:PHP,
  author          = "Butler W. Lampson and Kenneth A. Pier",
  title           = "A processor for a high-performance personal computer",
  journal         = j-COMP-ARCH-NEWS,
  volume          = "8",
  number          = "3",
  pages           = "146--160",
  month           = may,
  year            = "1980",
  CODEN           = "CANED2",
  ISSN            = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L          = "0163-5964",
  bibdate         = "Fri May 12 16:54:57 MDT 2006",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal        = "ACM SIGARCH Computer Architecture News",
  journal-URL     = "https://dl.acm.org/loi/sigarch",
}

@Article{Edwards:1980:MGN,
  author          = "D. B. G. Edwards and A. E. Knowles and J. V.
Woods", title = "{MU6-G}: a new design to achieve mainframe performance from a mini-sized computer", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "161--167", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Batcher:1980:AMP, author = "Kenneth E. Batcher", title = "Architecture of a massively parallel processor", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "168--173", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Palmer:1980:IND, author = "John Palmer", title = "The {Intel 8087} numeric data processor", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "174--181", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kuhn:1980:EMA, author = "Robert H. 
Kuhn", title = "Efficient mapping of algorithms to single-stage interconnections", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "182--189", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nassimi:1980:SRB, author = "David Nassimi and Sartaj Sahni", title = "A self routing {Benes} network", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "190--195", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{vonIssendorff:1980:ANF, author = "H. von Issendorff and W. 
Gr{\"u}newald", title = "An adaptable network for functional distributed systems", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "196--201", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Riad:1980:CFC, author = "Mokhtar Boshra Riad", title = "A combination of field and current access techniques for efficient and cost-effective bubble memories", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "202--210", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Trivedi:1980:DLS, author = "K. S. Trivedi", title = "Designing linear storage hierarchies so as to maximize reliability subject to cost and performance constraints", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "211--217", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ahuja:1980:APP, author = "Sudhir R. Ahuja and Charles S. 
Roberts", title = "An associative\slash parallel processor for partial match retrieval using superimposed codes", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "218--227", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ruggiero:1980:MBV, author = "M. D. Ruggiero and S. G. Zaky", title = "A microprocessor-based virtual memory system", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "228--235", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jagannathan:1980:TAI, author = "Anand Jagannathan", title = "A technique for the architectural implementation of software subsystems", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "236--244", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Berstis:1980:SPD, author = "Viktors Berstis", title = "Security and protection of data in the {IBM System\slash 38}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "245--252", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hoffmann:1980:HIC, author = "Miguel Garc{\'\i}a Hoffmann", title = "Hardware implementation of communication protocols: a formal approach", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "253--263", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Guillier:1980:ACF, author = "P. Guillier and D. Slosberg", title = "An architecture with comprehensive facilities of inter-process synchronization and communication", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "264--270", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lougheed:1980:CPP, author = "Robert M. Lougheed and David L. McCubbrey", title = "The cytocomputer: a practical pipelined image processor", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "271--277", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Halatsis:1980:ACM, author = "C. Halatsis and A. van Dam and J. Joosten and M. 
Letheren", title = "Architectural considerations for a microprogrammable emulating engine using bit-slices", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "278--291", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Irwin:1980:OPS, author = "Mary Jane Irwin and Don Heller", title = "Online pipeline systems for recursive numeric computations", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "292--299", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Foster:1980:DSP, author = "M. J. Foster and H. T. Kung", title = "Design of special-purpose {VLSI} chips: Example and opinions", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "300--307", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kumar:1980:SLC, author = "Anshul Kumar and P. C. P. 
Bhatt", title = "A structured language for {CAD} of digital systems", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "308--316", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hercksen:1980:HMS, author = "Uwe Hercksen and Rainer Klar and Wolfgang Klein{\"o}der", title = "Hardware-measurements of storage access conflicts in the processor array {EGPA(1)}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "317--324", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tokoro:1980:HLM, author = "Mario Tokoro and Kiichiro Tamaru and Masaaki Mizuno and Masao Hori", title = "A high level multi-lingual multiprocessor {KMP\slash II}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "3", pages = "325--333", year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 16:54:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Aupperle:1980:RIC, author = "Ken Aupperle", title = "A real innovation in computer architecture", journal = j-COMP-ARCH-NEWS, volume = "8", number = "4", pages = "6--7", month = jun, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 
2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Galloway:1980:AIR, author = "John R. {Galloway, Jr.}", title = "Architectural innovation round: round \#3", journal = j-COMP-ARCH-NEWS, volume = "8", number = "4", pages = "8--10", month = jun, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sharp:1980:STD, author = "John A. Sharp", title = "Some thoughts on data flow architectures", journal = j-COMP-ARCH-NEWS, volume = "8", number = "4", pages = "11--21", month = jun, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Payne:1980:VFP, author = "Mary Payne and Dileep Bhandarkar", title = "{VAX} floating point: a solid foundation for numerical computation", journal = j-COMP-ARCH-NEWS, volume = "8", number = "4", pages = "22--33", month = jun, year = "1980", CODEN = "CANED2", DOI = "https://doi.org/10.1145/641845.641849", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Sat Jun 24 12:02:21 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dickman:1980:TR, author = "Lloyd Dickman", title = "Treasurer's report", journal = j-COMP-ARCH-NEWS, 
volume = "8", number = "4", pages = "37--38", month = jun, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Staff:1980:CLAa, author = "{Computer Architecture News Staff}", title = "Current literature: abstracts of articles of interest\ldots{}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "4", pages = "48--48", month = jun, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Davies:1980:CAM, author = "Julian Davies", title = "Clock architecture and management", journal = j-COMP-ARCH-NEWS, volume = "8", number = "5", pages = "3--6", month = aug, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:16 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chroust:1980:RMO, author = "G. Chroust and J. R. 
M{\"u}hlbacher", title = "Rivalling multiprocessor organization: a hardware\slash speed trade-off", journal = j-COMP-ARCH-NEWS, volume = "8", number = "5", pages = "7--10", month = aug, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:16 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stevenson:1980:RPI, author = "David Stevenson", title = "A report on the proposed {IEEE Floating Point Standard (IEEE Task p754)}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "5", pages = "11--12", month = aug, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:16 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rattner:1980:OBC, author = "Justin Rattner and George Cox", title = "Object-based computer architecture", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "4--11", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Myers:1980:HIC, author = "G. J. Myers and B. R. S. 
Buckingham", title = "A hardware implementation of capability-based addressing", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "12--24", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patterson:1980:CRI, author = "David A. Patterson and David R. Ditzel", title = "The case for the reduced instruction set computer", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "25--33", month = oct, year = "1980", CODEN = "CANED2", DOI = "https://doi.org/10.1145/641914.641917", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Clark:1980:CCR, author = "Douglas W. Clark and William D. Strecker", title = "Comments on {``The Case for the Reduced Instruction Set Computer,''} by {Patterson} and {Ditzel}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "34--38", month = oct, year = "1980", CODEN = "CANED2", DOI = "https://doi.org/10.1145/641914.641918", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brakefield:1980:BAT, author = "James C. 
Brakefield", title = "Is 32 bits of address too much?", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "39--40", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brakefield:1980:PB, author = "James C. Brakefield", title = "The peripheral bus", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "41--43", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mudge:1980:BRR, author = "Trevor Mudge", title = "Book reviews: Review of {{\em The Structure of Computers and Computation, Vol. I\/}} by {David J. 
Kuck, John Wiley \& Sons 1978}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "44--45", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Staff:1980:CLAb, author = "{Computer Architecture News Staff}", title = "Current literature: abstracts of articles of interest\ldots{}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "6", pages = "46--46", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reed:1980:WFC, author = "Karl Reed", title = "The way forward in computer architecture research", journal = j-COMP-ARCH-NEWS, volume = "8", number = "7", pages = "3--7", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gilmore:1980:SEM, author = "John Gilmore", title = "Suggested enhancements to the {Motorola MC68000}", journal = j-COMP-ARCH-NEWS, volume = "8", number = "7", pages = "8--14", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal 
= "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wakerly:1980:PED, author = "John F. Wakerly", title = "{Pascal} extensions for describing computer instruction sets", journal = j-COMP-ARCH-NEWS, volume = "8", number = "7", pages = "15--23", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kavi:1980:SA, author = "Krishna M. Kavi", title = "Semantics of an algorithm", journal = j-COMP-ARCH-NEWS, volume = "8", number = "7", pages = "24--26", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Treleaven:1980:VMA, author = "Philip C. 
Treleaven", title = "{VLSI}: machine architecture and very high level languages", journal = j-COMP-ARCH-NEWS, volume = "8", number = "7", pages = "27--38", month = oct, year = "1980", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dickman:1981:SB, author = "Lloyd Dickman", title = "{SIGARCH} business", journal = j-COMP-ARCH-NEWS, volume = "9", number = "1", pages = "7--8", month = feb, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:28 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DePrycker:1981:NIM, author = "Martin L. {De Prycker}", title = "A new index mode for the {VAX-11}", journal = j-COMP-ARCH-NEWS, volume = "9", number = "2", pages = "10--11", month = apr, year = "1981", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296940.1296941", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:58:05 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "One advantage of most high level languages over machine languages consists of the availability of concepts which are frequently used by most programmers. 
One of these concepts is the array mechanism, where the high level language generally provides three operations associated with array manipulations: type-checking, bounds-checking and address calculation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stevenson:1981:PP, author = "David Stevenson", title = "The {Phoenix Project}", journal = j-COMP-ARCH-NEWS, volume = "9", number = "2", pages = "12--15", month = apr, year = "1981", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296940.1296942", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:58:05 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The Phoenix Project was an exploration of the issues surrounding large scale scientific computing. It was conducted at the Institute for Advanced Computation, NASA-Ames Research Center at Moffett Field, California from 1975 to 1979. The primary results of the project were a sizing of the likely needs of large scale scientific computing during the 1980s, what computing technology could be available to meet those needs, a conceptual design of a processor that could meet those needs, and a programming language suitable for use by this community on a parallel processor such as the one proposed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{VanOost:1981:MPS, author = "E. M. J. C. 
{Van Oost}", title = "Multi-processor system description and simulation using structured multi-programming languages", journal = j-COMP-ARCH-NEWS, volume = "9", number = "2", pages = "16--32", month = apr, year = "1981", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296940.1296943", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:58:05 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Most of the multi-processor systems designed for real time control demand a high efficiency, compromising the simplicity of the system. If this requirement imposes a hardware implementation of most of the primitives of the system, a complicated hardware will result. In order to retain to some extent the ease of using structured multi-programming languages, e.g. Concurrent Pascal [1], we have used these languages for the description and simulation of the complex hardware, instead of using them for software implementation of parallelism.\par This approach is explained with examples taken from an existing multi-processor system [2] developed at the Brussels Free University (V.U.B.).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wakerly:1981:BRR, author = "John Wakerly", title = "Book review: Review of {'The Computers that Saved Metropolis, by DC Comics and Radio Shack', July 1980}", journal = j-COMP-ARCH-NEWS, volume = "9", number = "2", pages = "33--34", month = apr, year = "1981", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296940.1296945", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:58:05 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } %%% TO DO: [04-Sep-2014] Volume 9 number 3: no data yet in ACM 
Portal database @Article{Arvind:1981:MPD, author = "Arvind and V. Kathail", title = "A Multiple Processor Data Flow Machine that Supports Generalized Procedures", journal = j-COMP-ARCH-NEWS, volume = "9", number = "3", pages = "291--302", month = may, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibsource = "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Compiler/Functional.bib; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "Proceedings of the 8th Annual Symposium on Computer Architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "functional dataflow", } @Article{Gerrity:1981:PI, author = "G. W. Gerrity", title = "On processes and interrupts", journal = j-COMP-ARCH-NEWS, volume = "9", number = "4", pages = "4--14", month = jun, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hill:1981:HMS, author = "Dwight D. Hill", title = "A hardware mechanism for supporting range checks", journal = j-COMP-ARCH-NEWS, volume = "9", number = "4", pages = "15--21", month = jun, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cherniavsky:1981:CMA, author = "Vladimir S. 
Cherniavsky", title = "The computing memory another distributed computer architecture", journal = j-COMP-ARCH-NEWS, volume = "9", number = "4", pages = "22--24", month = jun, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thornton:1981:ASC, author = "James E. Thornton", title = "{8th Annual Symposium on Computer Architecture: Heterogeneous Computer Architecture}", journal = j-COMP-ARCH-NEWS, volume = "9", number = "4", pages = "25--33", month = jun, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Staff:1981:ETP, author = "{Computer Architecture News Staff}", title = "Errata for two publications", journal = j-COMP-ARCH-NEWS, volume = "9", number = "4", pages = "34--34", month = jun, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:07 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lindsay:1981:CMM, author = "Donald C. 
Lindsay",
  title =        "Cache memory for microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "9",
  number =       "5",
  pages =        "6--13",
  month =        aug,
  year =         "1981",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1296947.1296948",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:16 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A growth path for current microprocessors is suggested which includes bus enhancements and cache memories. The implications are examined, and several differences from the mainframe world are pointed out.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kavi:1981:IAC,
  author =       "Krishna M. Kavi",
  title =        "Innovative architectures and commercial computers: a summary of the panel discussion at {NCC 1981}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "9",
  number =       "5",
  pages =        "14--16",
  month =        aug,
  year =         "1981",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1296947.1296949",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:16 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The session was held on May 4, 1981 in Chicago at NCC 1981. The panelists were Harvey Cragon, Pat Goldberg, Dave Patterson, Justin Rattner, Dean Earnest and Peter Denning. Krishna Kavi was the moderator. A complete report of the session is available and can be obtained by writing to the Computer Science Department, P. O. Box 44330, U.S.L., Lafayette, LA 70504.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jenevein:1981:EHS,
  author =       "R. M. Jenevein and D. DeGroot and G.
Jack Lipovski", title = "Errata: ``{A} hardware support mechanism for scheduling resources in parallel machine environment'': (from {Proceedings of the 8th Annual Symposium on Computer Architecture}, p. 57)", journal = j-COMP-ARCH-NEWS, volume = "9", number = "5", pages = "17--17", month = aug, year = "1981", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296947.1296950", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:16 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yuen:1981:EPS, author = "C. K. Yuen", title = "Extending the power of short-wordlength processors by means of context-dependent machine instructions", journal = j-COMP-ARCH-NEWS, volume = "9", number = "6", pages = "9--15", month = oct, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gottlieb:1981:CPP, author = "Allan Gottlieb and Clyde P. 
Kruskal", title = "Coordinating parallel processors: a partial unification", journal = j-COMP-ARCH-NEWS, volume = "9", number = "6", pages = "16--24", month = oct, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:1981:ESM, author = "Anonymous", title = "Errata: Structured machine design: an ongoing experiment", journal = j-COMP-ARCH-NEWS, volume = "9", number = "6", pages = "25--25", month = oct, year = "1981", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McDowell:1982:PML, author = "Charlie McDowell", title = "Protection at the micromachine level", journal = j-COMP-ARCH-NEWS, volume = "10", number = "1", pages = "4--8", month = jan, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Feustel:1982:PPC, author = "Edward A. 
Feustel", title = "Protected procedure call on the {PRIME(TM)} machines", journal = j-COMP-ARCH-NEWS, volume = "10", number = "1", pages = "9--22", month = jan, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{El-Halabi:1982:SRD, author = "Hossam El-Halabi and Dharma P. Agrawal", title = "Some remarks on direct execution computers", journal = j-COMP-ARCH-NEWS, volume = "10", number = "1", pages = "23--27", month = jan, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fitzpatrick:1982:RAV, author = "Daniel T. Fitzpatrick and John K. Foderaro and Manolis G. H. Katevenis and Howard A. Landman and David A. Patterson and James B. Peek and Zvi Peshkess and Carlo H. S{\'e}quin and Robert W. Sherburne and Korbin S. 
{Van Dyke}", title = "A {RISCy} approach to {VLSI}", journal = j-COMP-ARCH-NEWS, volume = "10", number = "1", pages = "28--32", month = jan, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rattner:1982:HSC, author = "Justin Rattner", title = "Hardware\slash software cooperation in the {iAPX-432}", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "1--1", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hennessy:1982:HST, author = "John Hennessy and Norman Jouppi and Forest Baskett and Thomas Gross and John Gill", title = "Hardware\slash software tradeoffs for increased performance", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "2--11", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rymarczyk:1982:CGP, author = "James W. 
Rymarczyk",
  title =        "Coding guidelines for pipelined processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "12--19",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Johnsson:1982:OMP,
  author =       "Richard K. Johnsson and John D. Wick",
  title =        "An overview of the {Mesa} processor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "20--29",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Berenbaum:1982:OSL,
  author =       "Alan D. Berenbaum and Michael W. Condry and Priscilla M.
Lu",
  title =        "The operating system and language support features of the {BELLMAC(TM)-32} microprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "30--38",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Radin:1982:M,
  author =       "George Radin",
  title =        "The 801 minicomputer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "39--47",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ditzel:1982:RAF,
  author =       "David R. Ditzel and H. R. McLellan",
  title =        "Register allocation for free: {The C} machine stack cache",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "48--56",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harbison:1982:AAO,
  author =       "Samuel P.
Harbison", title = "An architectural alternative to optimizing compilers", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "57--65", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lampson:1982:FPC, author = "Butler W. Lampson", title = "Fast procedure calls", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "66--76", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jones:1982:SPM, author = "Douglas W. 
Jones", title = "Systematic protection mechanism design", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "77--80", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reed:1982:GPM, author = "Karl Reed", title = "On a general property of memory mapping tables", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "81--86", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cook:1982:EIO, author = "Robert P. 
Cook and Nitin Donde", title = "An experiment to improve operand addressing", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "87--91", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fusaoka:1982:CCH, author = "Akira Fusaoka and Masaharu Hirayama", title = "Compiler chip: a hardware implementation of compiler", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "92--95", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rau:1982:ASE, author = "B. R. Rau and C. D. Glaeser and E. M. Greenawalt", title = "Architectural support for the efficient generation of code for horizontal architectures", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "96--99", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McLear:1982:GCD, author = "R. E. McLear and D. M. Scheibelhut and E. 
Tammaru", title = "Guidelines for creating a debuggable processor", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "100--106", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wilkes:1982:HSM, author = "M. V. Wilkes", title = "Hardware support for memory protection: {Capability} implementations", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "107--116", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pollack:1982:SAM, author = "Fred J. Pollack and George W. Cox and Dan W. Hammerstrom and Kevin C. Kahn and Konrad K. Lai and Justin R. Rattner", title = "Supporting {Ada} memory management in the {iAPX-432}", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "117--131", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sansonnet:1982:DEL, author = "J. P. Sansonnet and M. Castan and C. Percebois and D. Botella and J. 
Perez",
  title =        "Direct execution of {Lisp} on a list-directed architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "132--139",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Johnson:1982:SRA,
  author =       "Mark Scott Johnson",
  title =        "Some requirements for architectural support of software debugging",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "140--148",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Middelburg:1982:EPA,
  author =       "C. A. Middelburg",
  title =        "The effect of the {PDP-11} architecture on code generation for {CHILL}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "149--157",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sweet:1982:EAM,
  author =       "Richard E. Sweet and James G.
{Sandman, Jr.}",
  title =        "Empirical analysis of the {Mesa} instruction set",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "158--166",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McDaniel:1982:AMI,
  author =       "Gene McDaniel",
  title =        "An analysis of a {Mesa} instruction set using dynamic instruction frequencies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "167--176",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wiecek:1982:CSV,
  author =       "Cheryl A.
Wiecek", title = "A case study of {VAX-11} instruction set usage for compiler execution", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "177--184", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Maekawa:1982:FSA, author = "Mamoru Maekawa and Ken Sakamura and Chiaki Ishikawa", title = "Firmware structure and architectural support for monitors, vertical migration and user microprogramming", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "185--194", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kamibayashi:1982:HOS, author = "N. Kamibayashi and H. Ogawana and K. Nagayama and H. Aiso", title = "{Heart}: an operating system nucleus machine implemented by firmware", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "195--204", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ahuja:1982:MMA, author = "Sudhir R. 
Ahuja and Abhaya Asthana", title = "A multi-microprocessor architecture with hardware support for communication and scheduling", journal = j-COMP-ARCH-NEWS, volume = "10", number = "2", pages = "205--209", month = mar, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:44 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patterson:1982:RAH, author = "David A. Patterson and Richard S. Piepho", title = "{RISC} assessment: a high-level language experiment", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "3--8", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Clark:1982:MAI, author = "Douglas W. Clark and Henry M. 
Levy", title = "Measurement and analysis of instruction use in the {VAX-11\slash 780}", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "9--17", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kavi:1982:HAP, author = "Krishna Kavi and Boumediene Belkhouche and Evelyn Bullard and Lois Delcambre and Stephen Nemecek", title = "{HLL} architectures: {Pitfalls} and predilections", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "18--23", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gottlieb:1982:NUD, author = "Allan Gottlieb and Ralph Grishman and Clyde P. Kruskal and Kevin P. 
McAuliffe and Larry Rudolph and Marc Snir", title = "The {NYU Ultracomputer}---designing a {MIMD}, shared-memory parallel machine (extended abstract)", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "27--42", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chu:1982:VAH, author = "King-Hang Chu and King-Sun Fu", title = "{VLSI} architectures for high speed recognition of context-free languages and finite-state languages", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "43--49", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Franklin:1982:ACC, author = "Mark A. Franklin and Donald F. Wann", title = "Asynchronous and clocked control structures for {VLSI} based interconnection networks", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "50--59", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McMillen:1982:PFT, author = "Robert J. 
McMillen and Howard Jay Siegel", title = "Performance and fault tolerance improvements in the {Inverse Augmented Data Manipulator} network", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "63--72", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Parker:1982:GNM, author = "D. S. Parker and C. S. Raghavendra", title = "The {Gamma} network: a multiprocessor interconnection network with redundant paths", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "73--80", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jenevein:1982:CPR, author = "R. M. Jenevein and J. C. Browne", title = "A control processor for a reconfigurable array computer", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "81--89", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhuyan:1982:GCP, author = "Laxmi N. Bhuyan and Dharma P. 
Agrawal", title = "A general class of processor interconnection strategies", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "90--98", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burkowski:1982:ISD, author = "F. J. Burkowski", title = "Instruction set design issues relating to a static dataflow computer", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "101--111", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1982:DAE, author = "James E. Smith", title = "Decoupled access\slash execute computer architectures", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "112--119", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Caluwaerts:1982:DFA, author = "L. J. Caluwaerts and J. Debacker and J. A. 
Peperstraete", title = "A data flow architecture with a paged memory system", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "120--127", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rau:1982:ECG, author = "B. Ramakrishna Rau and Christopher D. Glaeser and Raymond L. Picard", title = "Efficient code generation for horizontal architectures: {Compiler} techniques and architectural support", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "131--139", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Barton:1982:SNH, author = "Gene C. Barton", title = "{Sentry}: a novel hardware implementation of classic operating system mechanisms", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "140--147", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Abramovici:1982:LSM, author = "M. Abramovici and Y. H. Levendel and P. R. 
Menon", title = "A logic simulation machine", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "148--157", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dasgupta:1982:TFL, author = "Subrata Dasgupta and Marius Olafsson", title = "Towards a family of languages for the design and implementation of machine architectures", journal = j-COMP-ARCH-NEWS, volume = "10", number = "3", pages = "158--167", month = apr, year = "1982", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1982:RPD, author = "Yann-Hang Lee and Kang G. 
Shin",
  title =        "Rollback propagation detection and performance evaluation of {FTMR2M}---a fault-tolerant multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "171--180",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lin:1982:DFT,
  author =       "Woei Lin and Chuan-lin Wu",
  title =        "Design of a $ 2 \times 2 $ fault-tolerant switching element",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "181--189",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fussell:1982:FTW,
  author =       "Donald Fussell and Peter Varman",
  title =        "Fault-tolerant wafer-scale architectures for {VLSI}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "190--198",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pramanik:1982:DF,
  author =       "Sakti Pramanik",
  title =        "Database filters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "201--210",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tokoro:1982:SSI,
  author =       "Mario Tokoro and Takashi Takizuka",
  title =        "On the semantic structure of information --- a proposal of the abstract storage architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "211--217",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dohi:1982:HSA,
  author =       "Yasunori Dohi and Akira Suzuki and Noriyuki Matsui",
  title =        "Hardware sorter and its application to data base machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "218--225",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Treleaven:1982:RCA,
  author =       "Philip C. Treleaven and Richard P. Hopkins",
  title =        "A recursive computer architecture for {VLSI}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "229--238",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Castan:1982:HRP,
  author =       "M. Castan and E. I.
Organick",
  title =        "{$ \mu $3L}: an {HLL-RISC} processor for parallel execution of {FP}-language programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "239--247",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hommes:1982:HSC,
  author =       "F. Hommes",
  title =        "The heap\slash substitution concept --- an implementation of functional operations on data structures for a reduction machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "248--256",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Reynolds:1982:SRA,
  author =       "Paul F. {Reynolds, Jr.}",
  title =        "A shared resource algorithm for distributed simulation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "259--266",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jain:1982:DPT,
  author =       "Bijendra N.
Jain",
  title =        "Duplication of packets and their detection in {X.25} communication protocols",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "267--273",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Markenscoff:1982:MPS,
  author =       "Pauline Markenscoff",
  title =        "A multiple processor system for real time control tasks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "274--280",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Miller:1982:HMD,
  author =       "Leslie Jill Miller",
  title =        "A heterogeneous multiprocessor design and the distributed scheduling of its task group workload",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "283--290",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goble:1982:DPV,
  author =       "George H. Goble and Michael H.
Marsh",
  title =        "A dual processor {VAX 11\slash 780}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "291--298",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dubois:1982:ECC,
  author =       "Michel Dubois and Fay{\'e} A. Briggs",
  title =        "Effects of cache coherency in multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "299--308",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mudge:1982:PAC,
  author =       "T. N. Mudge and B. A. Makrucki",
  title =        "Probabilistic analysis of a crossbar switch",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "311--320",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Levitan:1982:FEN,
  author =       "Steven P. Levitan and Caxton C.
Foster",
  title =        "Finding an extremum in a network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "321--325",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Premkumar:1982:RAR,
  author =       "U. V. Premkumar and J. C. Browne",
  title =        "Resource allocation in rectangular {SW} banyans",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "326--333",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:1982:LA,
  author =       "Anonymous",
  title =        "List of authors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "335--335",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mayer:1982:ABB,
  author =       "Alastair J. W.
Mayer",
  title =        "The architecture of the {Burroughs B5000}: 20 years later and still ahead of the times?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "4",
  pages =        "3--10",
  month =        jun,
  year =         "1982",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/641542.641543",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:07 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brakefield:1982:OSA,
  author =       "James C. Brakefield",
  title =        "From the other side of the {Atlantic}: how to improve upon the {MU5} design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "4",
  pages =        "11--16",
  month =        jun,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:07 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hansen:1982:PEI,
  author =       "Paul M. Hansen and Mark A. Linton and Robert N. Mayo and Marguerite Murphy and David A.
Patterson",
  title =        "A performance evaluation of the {Intel iAPX 432}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "4",
  pages =        "17--26",
  month =        jun,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:07 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Huguet:1982:PPS,
  author =       "Miquel Huguet",
  title =        "The protection of the processor status word of the {PDP-11\slash 60}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "4",
  pages =        "27--30",
  month =        jun,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:07 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brakefield:1982:JWO,
  author =       "James Brakefield",
  title =        "Just what is an op-code?: or a universal computer design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "4",
  pages =        "31--34",
  month =        jun,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:07 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Knott:1982:FDA,
  author =       "J. D. Knott and T. W.
Crockett",
  title =        "Fair dynamic arbitration for a multiprocessor communications bus",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "5",
  pages =        "4--9",
  month =        sep,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Larus:1982:CMA,
  author =       "James R. Larus",
  title =        "A comparison of microcode, assembly code, and high-level languages on the {VAX-11} and {RISC I}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "5",
  pages =        "10--15",
  month =        sep,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Patterson:1982:PEI,
  author =       "David A.
Patterson",
  title =        "A performance evaluation of the {Intel 80286}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "5",
  pages =        "16--18",
  month =        sep,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Egan:1982:EVC,
  author =       "Rod Egan",
  title =        "The effect of {VLSI} on computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "5",
  pages =        "19--22",
  month =        sep,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Benzie:1982:BRR,
  author =       "Thomas Benzie",
  title =        "Book reviews: Review of {{\em Microcomputer Architecture and Programming\/}} by {John F. Wakerly, John Wiley \& Sons, Inc., 1981}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "5",
  pages =        "23--23",
  month =        sep,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Levy:1982:UBM,
  author =       "Henry M. Levy and Douglas W.
Clark",
  title =        "On the use of benchmarks for measuring system performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "6",
  pages =        "5--8",
  month =        dec,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schulthess:1982:ONA,
  author =       "Peter Schulthess and Fritz Vonaesch",
  title =        "{OPA}: a new architecture for {Pascal-like} languages",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "6",
  pages =        "9--20",
  month =        dec,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brakefield:1982:TI,
  author =       "James C. Brakefield",
  title =        "Talk on interpreters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "6",
  pages =        "21--28",
  month =        dec,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Doran:1982:MFC,
  author =       "D. W.
Doran",
  title =        "Main frame computer trends",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "6",
  pages =        "29--44",
  month =        dec,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gajski:1983:CLS,
  author =       "Daniel Gajski and David Kuck and Duncan Lawrie and Ahmed Sameh",
  title =        "{CEDAR}: a large scale multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "1",
  pages =        "7--11",
  month =        mar,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{French:1983:TDF,
  author =       "Elaine French and Hugh Glaser",
  title =        "{TUKI}: a data flow processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "1",
  pages =        "12--18",
  month =        mar,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Marovac:1983:SAD,
  author =       "Nenad Marovac",
  title =        "A systematic approach to the design and implementation of a computer instruction set",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "1",
  pages =        "19--24",
  month =        mar,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cragon:1983:EIS,
  author =       "Harvey Cragon",
  title =        "Executable instruction set specification",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "1",
  pages =        "25--43",
  month =        mar,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Colwell:1983:PTR,
  author =       "Robert P. Colwell and Charles Y. Hitchcock and E. Douglas Jensen",
  title =        "Peering through the {RISC\slash CISC} fog: an outline of research",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "1",
  pages =        "44--50",
  month =        mar,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gorsline:1983:RAC,
  author =       "G. W. Gorsline",
  title =        "Review of {{\em Advances in Computer Architecture\/}} by {Glenford J. Myers, John Wiley \& Sons, Inc. 1982}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "1",
  pages =        "55--55",
  month =        mar,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sachs:1983:BRR,
  author =       "M. W.
Sachs",
  title =        "Book reviews: Review of {{\em Microcomputer Interfacing\/}} by {G. Jack Lipovski, Lexington Books 1980}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "1",
  pages =        "55--55",
  month =        mar,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Abramson:1983:HSP,
  author =       "David Abramson and John Rosenberg",
  title =        "Hardware support for program debuggers in a paged virtual memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "2",
  pages =        "8--19",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Frailey:1983:WLC,
  author =       "Dennis J. Frailey",
  title =        "Word length of a computer architecture definitions and applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "2",
  pages =        "20--26",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hollaar:1983:BRR,
  author =       "Lee A. Hollaar",
  title =        "Book reviews: Review of {{\em Computer Design\/}} by {Glen G.
Langdon, Computeach Press}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "2",
  pages =        "27--28",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wilkes:1983:SPS,
  author =       "Maurice V. Wilkes",
  title =        "Size, power, and speed (keynote address)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "2--4",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Giloi:1983:TTC,
  author =       "W. K.
Giloi",
  title =        "Towards a taxonomy of computer architecture based on the machine data type view",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "6--15",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Avizienis:1983:FTF,
  author =       "Algirdas Avi{\v{z}}ienis",
  title =        "Framework for a taxonomy of fault-tolerance attributes in computer systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "16--21",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pehrson:1983:CID,
  author =       "Bj{\"o}rn Pehrson and Joachim Parrow",
  title =        "Caddie an interactive design environment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "24--31",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dasgupta:1983:VCA,
  author =       "Subrata Dasgupta",
  title =        "On the verification of computer architectures using an architecture description language",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "32--38",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{King:1983:RSC,
  author =       "Richard M. King",
  title =        "Research on synthesis of concurrent computing systems (extended abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "39--46",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fisher:1983:APP,
  author =       "Allan L. Fisher and H. T. Kung and Louis M. Monier and Yasunori Dohi",
  title =        "Architecture of the {PSC}---a programmable systolic chip",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "48--53",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fisher:1983:SLV,
  author =       "Allan L. Fisher and H. T. Kung",
  title =        "Synchronizing large {VLSI} processor arrays",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "54--58",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wagner:1983:BVM,
  author =       "Robert A.
Wagner",
  title =        "The {Boolean Vector Machine [BVM]}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "59--66",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bonuccelli:1983:VTM,
  author =       "M. A. Bonuccelli and E. Lodi and F. Luccio and P. Maestrini and L. Pagli",
  title =        "A {VLSI} tree machine for relational data bases",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "67--73",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Caluwaerts:1983:ISD,
  author =       "L. J. Caluwaerts and J. Debacker and J. A. Peperstraete",
  title =        "Implementing streams on a data flow computer system with paged memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "76--83",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Requa:1983:PDF,
  author =       "Joseph E.
Requa",
  title =        "The {Piecewise Data Flow} architecture control flow and register management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "84--89",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tokoro:1983:WSC,
  author =       "Mario Tokoro and J. R. Jagannathan and Hideki Sunahara",
  title =        "On the working set concept for data-flow machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "90--97",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Marczynski:1983:DDS,
  author =       "R. W. Marczy{\'n}ski and J. Milewski",
  title =        "A data driven system based on a microprogrammed processor module",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "98--106",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Patterson:1983:AVI,
  author =       "David A.
Patterson and Phil Garrison and Mark Hill and Dimitris Lioupis and Chris Nyberg and Tim Sippel and Korbin {Van Dyke}",
  title =        "Architecture of a {VLSI} instruction cache for a {RISC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "108--116",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yeh:1983:PSC,
  author =       "Phil C. C. Yeh and Janak H. Patel and Edward S. Davidson",
  title =        "Performance of shared cache for parallel-pipelined computer systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "117--123",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goodman:1983:UCM,
  author =       "James R. Goodman",
  title =        "Using cache memory to reduce processor-memory traffic",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "124--131",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1983:SIC,
  author =       "James E. Smith and James R.
Goodman", title = "A study of instruction cache organizations and replacement policies", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "132--137", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fisher:1983:VLI, author = "Joseph A. Fisher", title = "{Very Long Instruction Word} architectures and the {ELI-512}", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "140--150", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tomita:1983:UML, author = "Shinji Tomita and Kiyoshi Shibayama and Toshiaki Kitamura and Toshiyuki Nakata and Hiroshi Hagiwara", title = "A user-microprogrammable, local host computer with low-level parallelism", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "151--157", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gumpertz:1983:CTE, author = "Richard H. 
Gumpertz", title = "Combining tags with error codes", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "160--165", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Park:1983:FDB, author = "Young Gil Park and Jung Wan Cho", title = "Fault diagnosis of bit-slice processor", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "166--172", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fiol:1983:LDI, author = "M. A. Fiol and I. Alegre and J. L. A. Yebra", title = "Line digraph iterations and the {$(d,k)$} problem for directed graphs", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "174--177", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Opper:1983:RAR, author = "Eli Opper and Miroslaw Malek and G.
Jack Lipovski", title = "Resource allocation in rectangular {CC}-banyans", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "178--184", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sovis:1983:UTS, author = "Franti{\v{s}}ek Sovi{\v{s}}", title = "Uniform theory of the shuffle-exchange type permutation networks", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "185--191", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Srini:1983:ACA, author = "Vason P. Srini and Jorge F. Asenjo", title = "Analysis of {Cray-1S} architecture", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "194--206", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jordan:1983:PMH, author = "Harry F. 
Jordan", title = "Performance measurements on {HEP} --- a pipelined {MIMD} computer", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "207--212", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Amano:1983:SSM, author = "Hideharu Amano and Takaichi Yoshida and Hideo Aiso", title = "{(SM)$^2$} --- {Sparse Matrix Solving Machine}", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "213--220", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Krishnan:1983:ESC, author = "R. Kalyana Krishnan and A. K. Rajasekar and C. S.
Moghe", title = "An experimental system for {Computer Science} instruction", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "222--227", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kronlof:1983:ECM, author = "Klaus Kronl{\"o}f", title = "Execution control and memory management of a {Data Flow Signal Processor}", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "230--235", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kishi:1983:DDD, author = "Masasuke Kishi and Hiroshi Yasuhara and Yasusuke Kawamura", title = "{DDDP}---a {Distributed Data Driven Processor}", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "236--242", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Takahashi:1983:DFP, author = "Naohisa Takahashi and Makoto Amamiya", title = "A data flow processor array system: {Design} and analysis", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "243--250", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = 
"Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pier:1983:RDH, author = "Kenneth A. Pier", title = "A retrospective on the {Dorado}, a high-performance personal computer", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "252--269", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dugan:1983:SEA, author = "Robert J. Dugan", title = "{System\slash 370} extended architecture: a program view of the channel subsystem", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "270--276", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Norton:1983:AIM, author = "Richard L. Norton and Jacob A. 
Abraham", title = "Adaptive interpretation as a means of exploiting complex instruction sets", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "277--282", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kumar:1983:SSC, author = "Manoj Kumar and Daniel M. Dias and J. R. Jump", title = "Switching strategies in a class of packet switching networks", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "284--300", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wah:1983:CSD, author = "Benjamin W. Wah", title = "A comparative study of distributed resource sharing on multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "301--308", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fuchs:1983:CED, author = "W. Kent Fuchs and Jacob A. 
Abraham and Kuang-Hua Huang", title = "Concurrent error detection in {VLSI} interconnection networks", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "309--315", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Giloi:1983:HFD, author = "W. K. Giloi and P. Behr", title = "Hierarchical function distribution --- a design principle for advanced multicomputer architectures", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "318--325", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stringa:1983:EIE, author = "Luigi Stringa", title = "{EMMA} --- an industrial experience on large multiprocessing architectures", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "326--333", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Philipson:1983:CSM, author = "Lars Philipson and Bo Nilsson and Bj{\"o}rn Breidegard", title = "A communication structure for a multiprocessor computer with distributed global memory", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "334--340", month = jun, year = "1983", CODEN =
"CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hayashi:1983:AHP, author = "Hiromu Hayashi and Akira Hattori and Haruo Akimoto", title = "{ALPHA}---a high-performance {LISP} machine equipped with a new stack structure and garbage collection system", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "342--348", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Umeyama:1983:PEM, author = "Shinji Umeyama and Koichiro Tamura", title = "A parallel execution model of logic programs", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "349--355", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schmittgen:1983:SAC, author = "Claudia Schmittgen and Werner Kluge", title = "A system architecture for the concurrent evaluation of applicative program expressions", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "356--362", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = 
ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yamaguchi:1983:PEL, author = "Yoshinori Yamaguchi and Kenji Toda and Toshitsugu Yuba", title = "A performance evaluation of a {Lisp}-based data-driven machine {(EM-3)}", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "363--369", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tanimoto:1983:PAP, author = "Steven L. Tanimoto", title = "A pyramidal approach to parallel processing", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "372--378", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gaillat:1983:DPP, author = "G{\'e}rard Gaillat", title = "The design of a parallel processor for image processing on-board satellites: an application oriented approach", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "379--386", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nishimura:1983:LPP, author = "Hitoshi Nishimura and Hiroshi Ohno and Toru Kawata and Isao Shirakawa and Koichi Omura", title 
= "{Links-1} --- a parallel pipelined multimicrocomputer system for image creation", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "387--394", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ericsson:1983:LSM, author = "T. Ericsson and P. E. Danielsson", title = "{LIPP} --- a {SIMD} multiprocessor architecture for image processing", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "395--400", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Treleaven:1983:NGC, author = "Philip C. 
Treleaven", title = "The new generation of computer architecture", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "402--409", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Uchida:1983:IMS, author = "Shunichi Uchida", title = "Inference machine: {From} sequential to parallel", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "410--416", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Moto-oka:1983:OFG, author = "Tohru Moto-oka", title = "Overview to the {Fifth Generation Computer System} project", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "417--422", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Murakami:1983:RDB, author = "Kunio Murakami and Takeo Kakuta and Nobuyoshi Miyazaki and Shigeki Shibayama and Haruo Yokota", title = "A relational data base machine: {First} step to knowledge base machine", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "423--425", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate 
= "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Arvind:1983:CMN, author = "Arvind and Robert A. Iannucci", title = "A critique of multiprocessing {von Neumann} style", journal = j-COMP-ARCH-NEWS, volume = "11", number = "3", pages = "426--436", month = jun, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hill:1983:ACM, author = "Dwight D. Hill", title = "An analysis of {C} machine support for other block-structured languages", journal = j-COMP-ARCH-NEWS, volume = "11", number = "4", pages = "6--16", month = sep, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Marovac:1983:IID, author = "Nenad Marovac", title = "On interprocess interaction in distributed architectures", journal = j-COMP-ARCH-NEWS, volume = "11", number = "4", pages = "17--22", month = sep, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schalkoff:1983:TED, author = "Robert J. 
Schalkoff", title = "Towards an efficient, dedicated architecture for a {Digital Geometric Image Transformer (DGIT)}", journal = j-COMP-ARCH-NEWS, volume = "11", number = "4", pages = "23--29", month = sep, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Plotkin:1983:TSA, author = "Arieh Plotkin and Daniel Tabak", title = "A {Tree Structured Architecture} for semantic gap reduction", journal = j-COMP-ARCH-NEWS, volume = "11", number = "4", pages = "30--44", month = sep, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wilkes:1983:KJI, author = "Maurice V. 
Wilkes", title = "Keeping jump instructions out of the pipeline of a {RISC}-like computer", journal = j-COMP-ARCH-NEWS, volume = "11", number = "5", pages = "5--7", month = dec, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:17 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jones:1983:PM, author = "Jeremy Jones", title = "Puzzling with microcode", journal = j-COMP-ARCH-NEWS, volume = "11", number = "5", pages = "8--12", month = dec, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:17 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Amsbury:1983:CSA, author = "Wayne Amsbury", title = "A code-splitting algorithm", journal = j-COMP-ARCH-NEWS, volume = "11", number = "5", pages = "13--21", month = dec, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:17 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dongarra:1983:PVC, author = "Jack J. 
Dongarra", title = "Performance of various computers using standard linear equations software in a {Fortran} environment", journal = j-COMP-ARCH-NEWS, volume = "11", number = "5", pages = "22--27", month = dec, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:17 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhujade:1983:DAC, author = "M. R. Bhujade", title = "On the design of {Always Compatible Instruction Set Architecture (ACISA)}", journal = j-COMP-ARCH-NEWS, volume = "11", number = "5", pages = "28--30", month = dec, year = "1983", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:17 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Heath:1984:RER, author = "J. L. Heath", title = "Re-evaluation of the {RISC I}", journal = j-COMP-ARCH-NEWS, volume = "12", number = "1", pages = "3--10", month = mar, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patterson:1984:RW, author = "David A.
Patterson", title = "{RISC} watch", journal = j-COMP-ARCH-NEWS, volume = "12", number = "1", pages = "11--19", month = mar, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Beeler:1984:BBB, author = "Michael Beeler", title = "Beyond the {Baskett} benchmark", journal = j-COMP-ARCH-NEWS, volume = "12", number = "1", pages = "20--31", month = mar, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Feustel:1984:PEP, author = "Edward A. Feustel", title = "Process exchange on the {PR1ME} family of computers", journal = j-COMP-ARCH-NEWS, volume = "12", number = "1", pages = "32--43", month = mar, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fenwick:1984:AOA, author = "P. M. 
Fenwick", title = "Addressing operations for automatic data structure accessing", journal = j-COMP-ARCH-NEWS, volume = "12", number = "1", pages = "44--57", month = mar, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yuen:1984:SAI, author = "C. K. Yuen", title = "Some applications of the implicit register reference", journal = j-COMP-ARCH-NEWS, volume = "12", number = "1", pages = "58--63", month = mar, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kavi:1984:AQ, author = "Krishna M. Kavi and K. Krishnamohan", title = "Architecture quality", journal = j-COMP-ARCH-NEWS, volume = "12", number = "1", pages = "64--72", month = mar, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agrawal:1984:BHH, author = "Dharma P. Agrawal and Winser E. 
Alexander", title = "{B-HIVE}: a heterogeneous, interconnected, versatile and expandable multicomputer system", journal = j-COMP-ARCH-NEWS, volume = "12", number = "2", pages = "7--13", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burkowski:1984:VAM, author = "F. J. Burkowski", title = "A vector and array multiprocessor extension of the {Sylvan} architecture", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "4--11", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kapauan:1984:PPC, author = "Alejandro Kapauan and J. Timothy Field and Dennis B. Gannon and Lawrence Snyder", title = "The {Pringle} parallel computer", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "12--20", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yasrebi:1984:SAS, author = "Mehrad Yasrebi and G. J.
Lipovski", title = "A state-of-the-art {SIMD} two-dimensional {FFT} array processor", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "21--27", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ma:1984:ARS, author = "Y. W. Ma and R. Krishnamurti", title = "The architecture of {Replica}: a special-purpose computer system for active multi-sensory perception of $3$-dimensional objects", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "30--37", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goldwasser:1984:GOD, author = "Samuel M. 
Goldwasser", title = "A generalized object display processor architecture", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "38--47", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kawakami:1984:SPL, author = "Katsura Kawakami and Shigeo Shimazaki", title = "A special purpose {LSI} processor using the {DDA} algorithm for image transformation", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "48--54", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wah:1984:SMM, author = "Benjamin W. Wah and Guo-Jie Li and Chee-Fen Yu", title = "The status of {MANIP} --- a multicomputer architecture for solving combinatorial extremum-search problems", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "56--63", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gonzalez-Rubio:1984:SFP, author = "R. Gonzalez-Rubio and J. Rohmer and D.
Terral", title = "The {SCHUSS} filter: a processor for non-numerical data processing", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "64--73", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ebeling:1984:DIV, author = "Carl Ebeling and Andrew Palay", title = "The design and implementation of a {VLSI} chess move generator", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "74--80", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1984:PAC, author = "Manjai Lee and Chuan-lin Wu", title = "Performance analysis of circuit switching baseline interconnection networks", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "82--90", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kruskal:1984:IBS, author = "Clyde P.
Kruskal and Marc Snir", title = "The importance of being square", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "91--98", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chin:1984:CPM, author = "Chi-Yuan Chin and Kai Hwang", title = "Connection principles for multipath, packet switching networks", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "99--108", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Weiss:1984:IIL, author = "Shlomo Weiss and James E. Smith", title = "Instruction issue logic for pipelined supercomputers", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "110--118", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wedig:1984:RBI, author = "Robert G. Wedig and Marc A. 
Rose", title = "The reduction of branch instruction execution overhead using structured control flow", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "119--125", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Banerjee:1984:FEL, author = "Utpal Banerjee and Daniel D. Gajski", title = "Fast execution of loops with if statements", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "126--132", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gajski:1984:PPR, author = "Daniel Gajski and Won Kim and Shinya Fushimi", title = "A parallel pipelined relational query processor: an architectural overview", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "134--141", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Somani:1984:EVD, author = "Arun K. Somani and Vinod K. 
Agarwal", title = "An efficient {VLSI} dictionary machine", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "142--150", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fisher:1984:DMS, author = "Allan L. Fisher", title = "Dictionary machines with a small number of processors", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "151--156", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hill:1984:EEC, author = "Mark D. Hill and Alan Jay Smith", title = "Experimental evaluation of on-chip microprocessor cache memories", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "158--166", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goodman:1984:USC, author = "James R. 
Goodman and Men-chow Chiang", title = "The use of static column {RAM} as a memory hierarchy", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "167--173", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Haikala:1984:CHRa, author = "I. J. Haikala", title = "Cache hit ratios with geometric task switch intervals", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "175--175", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ishikawa:1984:DOO, author = "Yutaka Ishikawa and Mario Tokoro", title = "The design of an object oriented architecture", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "178--187", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ungar:1984:ASS, author = "David Ungar and Ricki Blau and Peter Foley and Dain Samples and David Patterson", title = "Architecture of {SOAR}: {Smalltalk} on a {RISC}", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "188--197", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = 
"Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bose:1984:DIS, author = "Pradip Bose and Edward S. Davidson", title = "Design of instruction set architectures for support of high-level languages", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "198--206", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Quinton:1984:ASS, author = "Patrice Quinton", title = "Automatic synthesis of systolic arrays from uniform recurrent equations", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "208--214", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:1984:MDS, author = "Chang nian Zhang and David Y. Y. Yun", title = "Multi-dimensional systolic networks, for {Discrete Fourier Transform}", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "215--222", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fortes:1984:DBL, author = "J. 
A. B. Fortes and D. I. Moldovan", title = "Data broadcasting in linearly scheduled array processors", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "224--231", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ramakrishnan:1984:MMM, author = "I. V. Ramakrishnan and P. J. Varman", title = "Modular matrix multiplication on a linear array", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "232--238", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rao:1984:JEE, author = "T. R. N. 
Rao", title = "Joint encryption and error correction schemes", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "240--241", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bose:1984:UEC, author = "Bella Bose", title = "Unidirectional error correction\slash detection for {VLSI} memory", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "242--244", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:1984:ECC, author = "C. L. Chen", title = "Error-correcting codes for semiconductor memories", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "245--247", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ghaffar:1984:SEC, author = "Khaled Abdel Ghaffar and Robert J. 
McEliece", title = "Soft error correction for increased densities in {VLSI} memories", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "248--250", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{King:1984:CSA, author = "Richard M. King and Robert A. Wagner", title = "Combining speed with alpha-particle induced memory error tolerance in a large {Boolean} vector machine", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "251--253", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhuyan:1984:PLC, author = "Laxmi N. Bhuyan", title = "On the performance of loosely coupled multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "256--262", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mehrotra:1984:STD, author = "Ravi Mehrotra and Sarosh N.
Talukdar", title = "Scheduling of tasks for distributed processors", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "263--270", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kavi:1984:MRD, author = "Krishna M. Kavi and Edward W. Banios and Bruce D. Shriver", title = "Message repository definitional facility: an architectural model for interprocess communication", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "271--278", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Banerjee:1984:FSA, author = "Prithviraj Banerjee and Jacob A. 
Abraham", title = "Fault-secure algorithms for multiple-processor systems", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "279--287", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bic:1984:ELP, author = "Lubomir Bic", title = "Execution of logic programs on a dataflow architecture", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "290--296", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rudd:1984:HPF, author = "W. G. Rudd and Duncan A. Buell and Donald M. Chiarulli", title = "A high performance factoring machine", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "297--300", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Emer:1984:CPP, author = "Joel S. Emer and Douglas W. 
Clark", title = "A characterization of processor performance in the {VAX-11\slash 780}", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "301--310", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Moeller:1984:PPP, author = "W. D. Moeller and G. Sandweg", title = "The peripheral processor {PP4}, a highly regular {VLSI} processor", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "312--318", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Philipson:1984:VBD, author = "Lars Philipson", title = "{VLSI} based design principles for {MIMD} multiprocessor computers with distributed memory management", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "319--327", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Samatham:1984:MNS, author = "M. R. Samatham and D. K. 
Pradhan", title = "A multiprocessor network suitable for single-chip {VLSI} implementation", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "328--339", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rudolph:1984:DDC, author = "Larry Rudolph and Zary Segall", title = "Dynamic decentralized cache schemes for {MIMD} parallel processors", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "340--347", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Papamarcos:1984:LOC, author = "Mark S. Papamarcos and Janak H. 
Patel", title = "A low-overhead coherence solution for multiprocessors with private cache memories", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "348--354", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Archibald:1984:ESC, author = "James Archibald and Jean-Loup Baer", title = "An economical solution to the cache coherence problem", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "355--362", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Haikala:1984:CHRb, author = "Ilkka J. Haikala", title = "Cache hit ratios with geometric task switch intervals", journal = j-COMP-ARCH-NEWS, volume = "12", number = "3", pages = "364--371", month = jun, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chesley:1984:WM, author = "Gilman D.
Chesley", title = "A wafer microcomputer", journal = j-COMP-ARCH-NEWS, volume = "12", number = "4", pages = "4--6", month = sep, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Siegel:1984:PRP, author = "Howard Jay Siegel and Thomas Schwederski and Nathaniel J. {Davis IV} and James T. Kuehn", title = "{PASM}: a reconfigurable parallel system for image processing", journal = j-COMP-ARCH-NEWS, volume = "12", number = "4", pages = "7--19", month = sep, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Aslam:1984:MDC, author = "Javaid Aslam", title = "Methodology for designing a computer architecture", journal = j-COMP-ARCH-NEWS, volume = "12", number = "5", pages = "4--11", month = dec, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:18 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Graham:1984:PAS, author = "Peter C. J. 
Graham", title = "Providing architectural support for expert systems", journal = j-COMP-ARCH-NEWS, volume = "12", number = "5", pages = "12--18", month = dec, year = "1984", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:18 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dongarra:1985:PVC, author = "Jack J. Dongarra", title = "Performance of various computers using standard linear equations software in a {Fortran} environment", journal = j-COMP-ARCH-NEWS, volume = "13", number = "1", pages = "3--11", month = mar, year = "1985", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296930.1296931", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:22 MDT 2008", bibsource = "ftp://ftp.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This note compares the performance of different computer systems while solving dense systems of linear equations using the LINPACK software in a Fortran environment. About 100 computers, ranging from a CRAY X-MP to the 68000 based systems such as the Apollo and SUN Workstations to IBM PC's, are compared.", acknowledgement = ack-nhfb, classcodes = "C4140 (Linear algebra); C5470 (Performance evaluation and testing); C7310 (Mathematics computing)", corpsource = "Div. of Math. and Comput. Sci., Argonne Nat. 
Lab., IL, USA", fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "68000 based; Apollo workstations; Cray X-MP; dense systems; evaluation; FORTRAN environment; IBM PCs; linear algebra; linear equations; LINPACK; performance; performance comparison; performance evaluation; software; Sun Workstations; systems", treatment = "X Experimental", } @Article{Hor:1985:DPP, author = "T. M. Hor and C. K. Yuen", title = "The design and programming of a powerful short wordlength processor using context-dependent machine instructions", journal = j-COMP-ARCH-NEWS, volume = "13", number = "1", pages = "12--26", month = mar, year = "1985", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296930.1296932", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:22 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Context-dependent machine instructions were used to extend the capability of instruction set of a short wordlength processor. By freeing instruction bits for other purposes, a more powerful machine instruction set can be designed. Programming examples were given to illustrate the benefit obtained from the design. Less CPU time and memory space were required as compared with popular 8-bit CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Miya:1985:MDP, author = "E. N. 
Miya", title = "Multiprocessor\slash distributed processing bibliography (in machine-readable form)", journal = j-COMP-ARCH-NEWS, volume = "13", number = "1", pages = "27--29", month = mar, year = "1985", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296930.1296933", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:22 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "There is a lot of renewed interest in parallel processing. People parallel process, too. Human parallel processing tends to be cooperative rather than competitive. To this end, research literature uses bibliographies like road-maps to the field.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "annotated bibliography; cellular automata; computer system architecture; fault-tolerant computers; multicomputers; multiprocessor software; networks; operating systems; parallel algorithms; parallel processing; programming languages; supercomputers; vector processing", } @Article{Hu:1985:DAE, author = "Weiming Hu", title = "Dataflow architecture for {EEG} patient monitor", journal = j-COMP-ARCH-NEWS, volume = "13", number = "2", pages = "3--10", month = jun, year = "1985", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296935.1296936", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Much work is currently directed towards dataflow architectures. Most of the proposed architectures attempt to exploit fine grained parallelism. This paper describes an application specific dataflow architecture which exploits coarse grained parallelism. 
The application is that of a real-time patient monitor used to display patient data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tagg:1985:SEA, author = "A. G. Tagg", title = "Speculations on the evolution of an architecture", journal = j-COMP-ARCH-NEWS, volume = "13", number = "2", pages = "11--18", month = jun, year = "1985", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296935.1296937", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "PRIME computers was formed in the early 1970s by a splinter group of hardware and software engineers from Honeywell. With them, they brought their ideas on minicomputers, based on their experience of Honeywell minis, and their experience of the MULTICS operating system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Randell:1985:HST, author = "Brian Randell", title = "Hardware\slash software tradeoffs: a general design principle?", journal = j-COMP-ARCH-NEWS, volume = "13", number = "2", pages = "19--21", month = jun, year = "1985", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1296935.1296938", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware and software are logically equivalent. Any operation performed by software can also be built directly into the hardware and any instruction executed by the hardware can also be simulated in software. The decision to put certain features in hardware and others in software is based on such factors as cost, speed, reliability and frequency of change. 
There are no hard and fast rules to the effect that X must go into the hardware and Y must be programmed explicitly. Designers with different goals may, and often do, make different decisions\ldots{} the boundary between hardware and software is arbitrary and constantly changing. Today's software is tomorrow's hardware, and vice versa. [1]",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:1985:APM,
  author = "V. K. Prasanna Kumar and C. S. Raghavendra",
  title = "Array processor with multiple broadcasting",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "2--10",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wolf:1985:MMI,
  author = "G. Wolf and J. R. Jump",
  title = "Matrix multiplication in an interleaved array processing architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "11--17",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Goodman:1985:PVD,
  author = "J. R. Goodman and Jian-tu Hsieh and Koujuch Liou and Andrew R. Pleszkun and P. B. Schechter and Honesty C.
Young",
  title = "{PIPE}: a {VLSI} decoupled architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "20--27",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1985:TST,
  author = "Peter Y. T. Hsu and Joseph T. Rahmeh and Edward S. Davidson and Jacob A. Abraham",
  title = "{TIDBITS}: speedup via time-delay bit-slicing in {ALU} design for {VLSI} technology",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "29--35",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1985:IPI,
  author = "James E. Smith and Andrew R.
Pleszkun",
  title = "Implementation of precise interrupts in pipelined processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "36--44",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Schwetman:1985:CPP,
  author = "Herb Schwetman and Daniel Gajski and Dennis Gannon and Daniel Hills and Jacob Schwartz and James Browne",
  title = "Classification of parallel processor architectures (invited tutorial session)",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "45--45",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hasegawa:1985:HST,
  author = "Makoto Hasegawa and Yoshiharu Shigei",
  title = "High-speed top-of-stack scheme for {VLSI} processor: a management algorithm and its analysis",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "48--54",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hitchcock:1985:AMR,
  author = "Charles Y. {Hitchcock III} and H. M.
Brinkley Sprunt",
  title = "Analyzing multiple register sets",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "55--63",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1985:CEI,
  author = "Alan Jay Smith",
  title = "Cache evaluation and the impact of workload choice",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "64--73",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Moon:1985:AS,
  author = "David A. Moon",
  title = "Architecture of the {Symbolics 3600}",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "76--83",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ram:1985:PGC,
  author = "Ashwin Ram and Janak H.
Patel",
  title = "Parallel garbage collection without synchronization overhead",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "84--90",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sohi:1985:ELE,
  author = "Gurindar S. Sohi and Edward S. Davidson and Janak H. Patel",
  title = "An efficient {LISP}-execution architecture with a new representation for list structures",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "91--98",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): the machine name in the next title is (SM) squared
%%% (Sparse Matrix Solving Machine); the exponent had been flattened to
%%% a plain "2" and is restored here as braced math.  Confirm against
%%% the printed title page.
@Article{Amano:1985:SIN,
  author = "Hideharu Amano and Taisuke Boku and Tomohiro Kudoh and Hideo Aiso",
  title = "{(SM)$^2$-II}: a new version of the sparse matrix solving machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "100--107",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Beetem:1985:GS,
  author = "John Beetem and Monty Denneau and Don Weingarten",
  title = "The {GF11} supercomputer",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "108--115",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM),
0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1985:MUD,
  author = "Bradley Warren Smith and Howard Jay Siegel",
  title = "Models for use in the design of macro-pipelined parallel processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "116--123",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Edler:1985:IRM,
  author = "Jan Edler and Allan Gottlieb and Clyde P. Kruskal and Kevin P. McAuliffe and Larry Rudolph and Marc Snir and Patricia J. Teller and James Wilson",
  title = "Issues related to {MIMD} shared-memory computers: the {NYU Ultracomputer} approach",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "126--135",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ibbett:1985:MPV,
  author = "R. N. Ibbett and P. C. Capon and N. P.
Topham",
  title = "{MU6V}: a parallel vector processing system",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "136--144",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lundstrom:1985:DCH,
  author = "Stephen F. Lundstrom",
  title = "A decentralized control, highly concurrent multiprocessor",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "145--151",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dally:1985:OOA,
  author = "William J. Dally and James T. Kajiya",
  title = "An object oriented architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "154--161",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gehringer:1985:TAH,
  author = "Edward F. Gehringer and J.
Leslie Keedy",
  title = "Tagged architecture: how compelling are its advantages?",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "162--170",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Nanba:1985:VAV,
  author = "S. Nanba and N. Ohno and H. Kubo and H. Morisue and T. Ohshima and H. Yamagishi",
  title = "{VM\slash 4}: {ACOS-4} virtual machine architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "171--178",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dobry:1985:PSP,
  author = "T. P. Dobry and A. M. Despain and Y. N.
Patt",
  title = "Performance studies of a {Prolog} machine architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "180--190",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): the fifth author of the next entry was spelled
%%% "Umemutra"; corrected to "Umemura" on the presumption that the
%%% earlier form was a transcription slip.  Confirm against the
%%% printed proceedings.
@Article{Nakazaki:1985:DHS,
  author = "Ryosei Nakazaki and Akihiko Konagaya and Shin'ichi Habata and Hideo Shimazu and Mamoru Umemura and Masahiro Yamamoto and Minoru Yokota and Takashi Chikayama",
  title = "Design of a high-speed {Prolog} machine {(HPM)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "191--197",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Woo:1985:HUU,
  author = "Nam Sung Woo",
  title = "A hardware unification unit: design and analysis",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "198--205",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Matelan:1985:FM,
  author = "Nicholas Matelan",
  title = "The {FLEX\slash 32} multicomputer",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "209--213",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Rattner:1985:CMT,
  author = "J. Rattner",
  title = "Commercial multiprocessors (title only)",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "214--214",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Naedel:1985:CCA,
  author = "Dick Naedel",
  title = "Closely coupled asynchronous hierarchical and parallel processing in an open architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "215--220",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Savage:1985:PPL,
  author = "Jim Savage",
  title = "Parallel processing as a language design problem",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "221--224",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Rodgers:1985:IMS,
  author = "David P.
Rodgers",
  title = "Improvements in multiprocessor system design",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "225--231",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mark:1985:SCF,
  author = "Peter B. Mark",
  title = "The {Sequoia} computer: a fault-tolerant tightly-coupled multiprocessor architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "232--232",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Nestle:1985:SNS,
  author = "Elliot Nestle and Armond Inselberg",
  title = "The {SYNAPSE N+1 System}: architectural characteristics and performance data of a tightly-coupled multiprocessor system",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "233--239",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Horst:1985:AHV,
  author = "Robert W. Horst and Timothy C. K.
Chou",
  title = "An architecture for high volume transaction processing",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "240--245",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Stone:1985:FGC,
  author = "Harold Stone and Eric Manning and Harriet Rigas and Philip Treleaven",
  title = "The fifth generation computer systems projects (invited session)",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "247--247",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kamiya:1985:HPA,
  author = "Shigeo Kamiya and Susumu Matsuda and Kazuhide Iwata and Shigeki Shibayama and Hiroshi Sakai and Kunio Murakami",
  title = "A hardware pipeline algorithm for relational database operation",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "250--257",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1985:DMR,
  author = "Dik Lun Lee",
  title = "A distributed multiple-response resolver for value-order retrieval",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "258--265",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Feo:1985:DDR,
  author = "John Feo and Roy Jenevein and J. C. Browne",
  title = "Dynamic, distributed resource configuration on {SW}-banyans",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "268--275",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Katz:1985:ICC,
  author = "R. H. Katz and S. J. Eggers and D. A. Wood and C. L. Perkins and R. G. Sheldon",
  title = "Implementing a cache consistency protocol",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "276--283",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Li:1985:TRS,
  author = "Zhiyuan Li and Walid Abu-Sufah",
  title = "A technique for reducing synchronization overhead in large scale multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "284--291",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer
Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Whitby-Strevens:1985:T,
  author = "Colin Whitby-Strevens",
  title = "The transputer",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "292--300",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hurson:1985:SMU,
  author = "A. R. Hurson and B. Shirazi",
  title = "A systolic multiplier unit and its {VLSI} design",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "302--309",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Melhem:1985:LSS,
  author = "Rami Melhem",
  title = "A language for the simulation of systolic architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "310--314",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chuang:1985:VSA,
  author = "Henry Y. H.
Chuang and Guo He",
  title = "A versatile systolic array for matrix computations",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "315--322",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Vedder:1985:HDF,
  author = "Rex Vedder and Dennis Finn",
  title = "The {Hughes Data Flow Multiprocessor}: architecture for efficient signal and data processing",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "324--332",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Traub:1985:APG,
  author = "Kenneth R. Traub",
  title = "An abstract parallel graph reduction machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "333--341",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Preiss:1985:DFQ,
  author = "Bruno R. Preiss and V. C.
Hamacher",
  title = "Data flow on a queue machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "342--351",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gaudiot:1985:MHS,
  author = "J. L. Gaudiot",
  title = "Methods for handling structures in data-flow systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "352--358",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Samatham:1985:BMN,
  author = "M. R. Samatham and D. K.
Pradhan",
  title = "The {de Bruijn} multiprocessor network: a versatile sorting network",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "360--367",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Tzeng:1985:FTS,
  author = "Nian-Feng Tzeng and Pen-Chung Yew and Chun-Qi Zhu",
  title = "A fault-tolerant scheme for multistage interconnection networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "368--375",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:1985:DAF,
  author = "V. P. Kumar and S. M. Reddy",
  title = "Design and analysis of fault-tolerant multistage interconnection networks with low link complexity",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "376--386",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Davis:1985:PAP,
  author = "Nathaniel J.
{Davis IV} and Howard Jay Siegel",
  title = "The performance analysis of partitioned circuit switched multistage interconnection networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "387--394",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Vrsalovic:1985:IPD,
  author = "Dalibor Vrsalovic and Edward F. Gehringer and Zary Z. Segall and Daniel P. Siewiorek",
  title = "The influence of parallel decomposition strategies on the performance of multiprocessor systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "396--405",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Abu-Sufah:1985:PPT,
  author = "Walid Abu-Sufah and Alex Y. Kwok",
  title = "Performance prediction tools for {Cedar}: a multiprocessor supercomputer",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "406--413",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Grino:1985:ASM,
  author = "Jos{\'e} M.
Llaber{\'\i}a Gri{\~n}{\'o} and Mateo Valero Cort{\'e}s and Enrique Herrada Lillo and Jes{\'u}s Labarta Mancho",
  title = "Analysis and simulation of multiplexed single-bus networks with and without buffering",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "414--421",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sanguinetti:1985:PMB,
  author = "J. Sanguinetti and B. Kumar",
  title = "Performance of a message-based multiprocessor",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "3",
  pages = "424--425",
  month = jun,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:54 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hake:1985:PDP,
  author = "J.-Fr.
Hake",
  title = "{PDOC} --- a database on parallel processing literature",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "4",
  pages = "2--7",
  month = sep,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:10 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Rockey:1985:DAS,
  author = "Mark Rockey",
  title = "The dataflow architecture: a suitable base for the implementation of expert systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "13",
  number = "4",
  pages = "8--14",
  month = sep,
  year = "1985",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:10 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cragon:1985:ADS,
  author = "Harvey G.
Cragon", title = "An architecture design system", journal = j-COMP-ARCH-NEWS, volume = "13", number = "4", pages = "15--21", month = sep, year = "1985", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huguet:1985:RRF, author = "Miquel Huguet and Tom{\'a}s Lang", title = "A reduced register file for {RISC} architectures", journal = j-COMP-ARCH-NEWS, volume = "13", number = "4", pages = "22--31", month = sep, year = "1985", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alexander:1985:TBP, author = "Cedell A. Alexander and William M. 
Keshlear and Faye Briggs", title = "Translation buffer performance in a {UNIX} environment", journal = j-COMP-ARCH-NEWS, volume = "13", number = "5", pages = "2--14", month = dec, year = "1985", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:18 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1985:HSC, author = "Rosanna Lee", title = "On ``hot spot'' contention", journal = j-COMP-ARCH-NEWS, volume = "13", number = "5", pages = "15--20", month = dec, year = "1985", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:18 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Woo:1986:CHU, author = "Nam Sung Woo and Richard O'Keefe", title = "A comment on {``A hardware unification unit: design and analysis''}", journal = j-COMP-ARCH-NEWS, volume = "14", number = "1", pages = "2--3", month = jan, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ruighaver:1986:DAD, author = "A. B. 
Ruighaver", title = "Design aspects of the {Delft Parallel Processor DPP84} and its programming system", journal = j-COMP-ARCH-NEWS, volume = "14", number = "1", pages = "4--8", month = jan, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hammerstrom:1986:CAP, author = "Dan Hammerstrom and David Maier and Shreekant Thakkar", title = "The {Cognitive Architecture Project}", journal = j-COMP-ARCH-NEWS, volume = "14", number = "1", pages = "9--21", month = jan, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1986:BRC, author = "Alan Jay Smith", title = "Bibliography and reading on {CPU} cache memories and related topics", journal = j-COMP-ARCH-NEWS, volume = "14", number = "1", pages = "22--42", month = jan, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yokota:1986:MAR, author = "H. Yokota and H. 
Itoh", title = "A model and an architecture for a relational knowledge base", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "2--9", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Amamiya:1986:IEL, author = "M. Amamiya and M. Takesue and R. Hasegawa and H. Mikami", title = "Implementation and evaluation of a list-processing-oriented data flow machine", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "10--19", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Takahashi:1986:NSS, author = "K. Takahashi and H. Yamada and H. Nagai and K. Matsumi", title = "A new string search hardware architecture for {VLSI}", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "20--27", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gupta:1986:PAA, author = "A. Gupta and C. Forgy and A. Newell and R. 
Wedig", title = "Parallel algorithms and architectures for rule-based systems", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "28--37", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Halstead:1986:CDM, author = "R. R. {Halstead, Jr.} and T. L. Anderson and R. B. Osborne and T. L. Sterling", title = "{Concert}: design of a multiprocessor development system", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "40--48", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kung:1986:MRB, author = "H. T. Kung", title = "Memory requirements for balanced computer architectures", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "49--54", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hong:1986:GAS, author = "Y. C. Hong and T. H. Payne and L. B. O. 
Ferguson", title = "Graph allocation in static dataflow systems", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "55--64", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agrawal:1986:SIR, author = "P. Agrawal and R. Agrawal", title = "Software implementation of a recursive fault tolerance algorithm on a network of computers", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "65--72", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nojiri:1986:MPO, author = "T. Nojiri and S. Kawasaki and K. Sakoda", title = "Microprogrammable processor for object-oriented architecture", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "74--81", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thakkar:1986:IFU, author = "S. S. Thakkar and W. E. 
Hostmann", title = "An instruction fetch unit for a graph reduction machine", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "82--91", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gehringer:1986:FOO, author = "E. F. Gehringer and R. P. Colwell", title = "Fast object-oriented procedure calls: lessons from the {Intel 432}", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "92--101", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dias:1986:CMS, author = "D. M. Dias and B. R. Iyer and P. S. Yu", title = "On coupling many small systems for transaction processing", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "104--110", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Malkawi:1986:PMP, author = "M. I. Malkawi and J. H. 
Patel",
  title =        "Performance measurement of paging behavior in multiprogramming systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "111--118",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agarwal:1986:ANT,
  author =       "A. Agarwal and R. L. Sites and M. Horowitz",
  title =        "{ATUM}: a new technique for capturing address traces using microcode",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "119--127",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the page range below was previously recorded as 119--127,
%%% which duplicated the range of Agarwal:1986:ANT in the same issue.
%%% It is corrected here to 130--139, believed to match the ISCA '86
%%% proceedings contents; TODO: confirm against the publisher's
%%% table of contents.
@Article{Wise:1986:EES,
  author =       "M. J. Wise",
  title =        "Experimenting with {EPILOG}: some results and preliminary conclusions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "130--139",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the page range below was previously recorded as 128--139,
%%% which left pages 140--148 unassigned before Ito:1986:APE
%%% (149--156).  It is corrected here to 140--148, believed to match
%%% the ISCA '86 proceedings contents; TODO: confirm against the
%%% publisher's table of contents.
@Article{Shobatake:1986:UPB,
  author =       "Y. Shobatake and H. Aiso",
  title =        "A unification processor based on a uniformly structured cellular hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "140--148",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ito:1986:APE,
  author =       "N. Ito and M. Sato and E. Kuno and K. Rokusawa",
  title =        "The architecture and preliminary evaluation results of the experimental parallel inference machine {PIM-D}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "149--156",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:1986:ERC,
  author =       "A. Seznec",
  title =        "An efficient routing control for the {SIGMA} network {$ \Sigma (4) $}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "158--168",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nicoud:1986:RHP,
  author =       "J. D. Nicoud and K.
Skala", title = "{REYSM}, a high performance, low power multi-processor bus", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "169--174", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1986:ESG, author = "K. Y. Lee and W. Hegazy", title = "The extra stage gamma network", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "175--182", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yuhara:1986:EFA, author = "M. Yuhara and A. Hattori and M. Niwa and M. Kishimoto and H. Hayashi", title = "Evaluation of the {FACOM ALPHA Lisp} machine", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "184--190", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pleszkun:1986:AEL, author = "A. R. Pleszkun and M. J. 
Thazhuthaveetil", title = "An architecture for efficient {Lisp} list access", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "191--198", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nakata:1986:FLS, author = "T. Nakata and N. Koike", title = "A functional level simulation engine of {MAN-YO}: a special purpose parallel machine for logic design automation", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "202--208", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Frank:1986:EPS, author = "E. H. Frank", title = "Exploiting parallelism in a switch-level simulation machine", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "209--215", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anantharaman:1986:HAS, author = "T. S. Anantharaman and R. 
Bisiani",
  title =        "A hardware accelerator for speech recognition algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "216--223",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shimada:1986:EPD,
  author =       "T. Shimada and K. Hiraki and K. Nishida and S. Sekiguchi",
  title =        "Evaluation of a prototype data flow processor of the {SIGMA-1} for scientific computations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "226--234",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sargeant:1986:SDS,
  author =       "J. Sargeant and C. C. Kirkham",
  title =        "Stored data structures on the {Manchester} dataflow machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "235--242",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the first author was previously recorded as K. Hawakami,
%%% believed to be a misspelling of K. Kawakami; TODO: confirm
%%% against the paper.  The citation key deliberately keeps the old
%%% spelling so that existing citations of this entry still resolve.
@Article{Hawakami:1986:SDS,
  author =       "K. Kawakami and J. R. Gurd",
  title =        "A scalable dataflow structure store",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "243--250",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the inline math {$3$} in the title below is brace-protected,
%%% like the leading formula, so that a style's title recasing leaves
%%% it untouched.
@Article{Hasegawa:1986:FFT,
  author =       "M. Hasegawa and Y. Shigei",
  title =        "{$ A T^2 = O(N \log^4 N), T = O(\log N) $} {Fast Fourier Transform} in a light connected {$3$}-dimensional {VLSI}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "252--260",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sapiecha:1986:MAH,
  author =       "K. Sapiecha and R. Jarocki",
  title =        "Modular architecture for high performance implementation of {FFT} algorithm",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "261--270",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the second author's surname in the next entry is written
%%% with its accent (previously plain Llaberia), presumably the same
%%% person as in the 1985 single-bus networks entry; TODO: confirm.
@Article{Navarro:1986:CSI,
  author =       "J. J. Navarro and J. M. Llaber{\'\i}a and M.
Valero", title = "Computing size-independent matrix problems on systolic array processors", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "271--278", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tomita:1986:CLL, author = "S. Tomita and K. Shibayama and T. Nakata and S. Yuasa and H. Hagiwara", title = "A computer with low-level parallelism {QA-2}: its applications to {$3$-D} graphics and {Prolog\slash Lisp} machines", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "280--289", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hirayama:1986:VOA, author = "M. Hirayama", title = "{VLSI} oriented asynchronous architecture", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "290--296", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hwu:1986:HHP, author = "W. Hwu and Y. N. 
Patt", title = "{HPSm}, a high performance restricted data flow architecture having minimal functionality", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "297--306", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Onaga:1986:DRA, author = "K. Onaga and T. Takechi", title = "On design of rotary array communication and wavefront-driven algorithms for solving large-scale band-limited matrix equations", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "308--315", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Napolitano:1986:CAD, author = "L. M. {Napolitano, Jr.}", title = "A computer architecture for dynamic finite element analysis", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "316--323", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Harper:1986:PEV, author = "D. T. {Harper III} and J. R. 
Jump", title = "Performance evaluation of vector accesses in parallel memories using a skewed storage scheme", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "324--328", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kondo:1986:PMA, author = "T. Kondo and T. Tsuchiya and T. Kitamura and Y. Sugiyama and T. Kimura", title = "Pseudo {MIMD} array processor---{AAP2}", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "330--337", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fisher:1986:SLA, author = "A. L. Fisher", title = "Scan line array processors for image computation", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "338--345", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Annaratone:1986:WAI, author = "M. Annaratone and E. Arnould and T. Gross and H. T. Kung and M. S. 
Lam", title = "{Warp} architecture and implementation", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "346--356", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wood:1986:CAT, author = "D. A. Wood and S. J. Eggers and G. Gibson and M. D. Hill and J. M. Pendleton", title = "An in-cache address translation mechanism", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "358--365", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheriton:1986:SCC, author = "D. R. Cheriton and G. A. Slavenburg and P. D. Boyle", title = "Software-controlled caches in the {VMP} multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "366--374", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goodman:1986:URV, author = "J. R. Goodman and W. C. Hsu", title = "On the use of registers vs. 
cache to minimize memory traffic",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "375--383",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1986:HCS,
  author =       "P. Y. T. Hsu and E. S. Davidson",
  title =        "Highly concurrent scalar processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "386--395",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the second author was previously recorded as J. Hennesey,
%%% a misspelling of J. Hennessy; corrected so that author indexes
%%% and searches group this paper with his other work.
@Article{McFarling:1986:RCB,
  author =       "S. McFarling and J. Hennessy",
  title =        "Reducing the cost of branches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "396--403",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kunkel:1986:OPS,
  author =       "S. R. Kunkel and J. E.
Smith", title = "Optimal pipelining in supercomputers", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "404--411", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sweazey:1986:CCC, author = "P. Sweazey and A. J. Smith", title = "A class of compatible cache consistency protocols and their support by the {IEEE Futurebus}", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "414--423", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bitar:1986:MCS, author = "P. Bitar and A. M. Despain", title = "Multiprocessor cache synchronization: issues, innovations, evolution", journal = j-COMP-ARCH-NEWS, volume = "14", number = "2", pages = "424--433", month = jun, year = "1986", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dubois:1986:MAB, author = "M. Dubois and C. Scheurich and F. 
Briggs",
  title = "Memory access buffering in multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "2",
  pages = "434--442",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:43 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Taylor:1986:ESL,
  author = "G. S. Taylor and P. N. Hilfinger and J. R. Larus and D. A. Patterson and B. G. Zorn",
  title = "Evaluation of the {SPUR Lisp} architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "2",
  pages = "444--452",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:43 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Woo:1986:RCC: the title quotes a comment that itself quotes a paper
%%% title, so the outer quotation uses double quote marks and the inner
%%% one single quote marks, closed innermost-first with a thin space
%%% between the adjacent closing marks.
@Article{Woo:1986:RCC,
  author = "Nam Sung Woo",
  title = "A reply to comments {``A Comment on `A Hardware Unification Unit: Design and Analysis'\,''}",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "3",
  pages = "2--4",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:55 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{DuBose:1986:MR,
  author = "D. K. DuBose and D. K. Fotakis and D.
Tabak",
  title = "A microcoded {RISC}",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "3",
  pages = "5--16",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:55 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lang:1986:RRS,
  author = "Tom{\'a}s Lang and Miquel Huguet",
  title = "Reduced register saving\slash restoring in single-window register files",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "3",
  pages = "17--26",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:55 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Rouse:1986:TDH,
  author = "Larry O'Neal Rouse",
  title = "The twisted double helix: a minimum distance architecture for 5th generation computing",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "3",
  pages = "27--33",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:55 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Harland:1986:RMT,
  author = "David M.
Harland",
  title = "A recursively microcodable tagged architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "3",
  pages = "34--40",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:55 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Alexander:1986:CMP,
  author = "Cedell Alexander and William Keshlear and Furrokh Cooper and Faye Briggs",
  title = "Cache memory performance in a {Unix} environment",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "3",
  pages = "41--61",
  month = jun,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:55 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Stokes:1986:THV,
  author = "Roger Stokes",
  title = "Traces for hardware verification",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "4",
  pages = "7--14",
  month = sep,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kirner:1986:DDS,
  author = "Claudio Kirner and Eduardo Marques",
  title = "Design of a distributed system support based on a centralized parallel bus",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "4",
  pages = "15--26",
  month = sep,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Irwin:1986:STR,
  author = "Mary Jane Irwin",
  title = "Secretary\slash Treasurer's {Report}",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "4",
  pages = "28--28",
  month = sep,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Harland:1986:MOO,
  author = "David M. Harland and Bruno Beloff",
  title = "Microcoding an object-oriented instruction set",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "5",
  pages = "3--12",
  month = dec,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:18 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Stallings:1986:ABR,
  author = "William Stallings",
  title = "An annotated bibliography on reduced instruction set computers",
  journal = j-COMP-ARCH-NEWS,
  volume = "14",
  number = "5",
  pages = "13--19",
  month = dec,
  year = "1986",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:18 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Halstead:1987:OCM,
  author = "Robert H.
{Halstead, Jr.}",
  title = "Overview of {Concert MultiLisp}: a multiprocessor symbolic computing system",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "5--14",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Patterson:1987:PRS,
  author = "Dave Patterson",
  title = "A progress report on {SPUR}: {February 1, 1987}",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "15--21",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Despain:1987:A,
  author = "A. Despain and Y. Patt and V. Srini and P. Bitar and W. Bush and C. Chien and W. Citrin and B. Fagin and W. Hwu and S. Melvin and R. McGeer and A. Singhal and M. Shebanow and P. {Van Roy}",
  title = "Aquarius",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "22--34",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kohli:1987:OPP,
  author = "Madhur Kohli and Mark E.
Giuliano and Jack Minker",
  title = "An overview of the {PRISM} project",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "35--42",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hermenegildo:1987:DHP,
  author = "M. V. Hermenegildo and R. A. Warren",
  title = "Designing a high performance parallel logic programming system",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "43--52",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mills:1987:CGR,
  author = "Jonathan W.
Mills",
  title = "Coming to grips with a {RISC}: a report of the progress of the {LOW RISC} design group",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "53--62",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Short:1987:UIS,
  author = "Brian Short",
  title = "Use of instruction set simulators to evaluate the {LOW RISC}",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "63--67",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gutzmann:1987:ODH,
  author = "Kurt M.
Gutzmann",
  title = "Optimal dimension of hypercubes for sorting",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "68--72",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chesley:1987:AWN,
  author = "Gilman Chesley",
  title = "Addressable {WSI}: a non-redundant approach",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "73--80",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Biswas:1987:CCS,
  author = "Nripendra N. Biswas and S. Srinivas and Trishala Dharanendra",
  title = "A centrally controlled shuffle network for reconfigurable and fault-tolerant architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "1",
  pages = "81--87",
  month = mar,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:31 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ditzel:1987:BFC,
  author = "D. R. Ditzel and H. R.
McLellan",
  title = "Branch folding in the {CRISP} microprocessor: reducing branch delay to zero",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "2--8",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{DeRosa:1987:EBA,
  author = "J. A. DeRosa and H. M. Levy",
  title = "An evaluation of branch architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "10--16",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hwu:1987:CRO,
  author = "W. W. Hwu and Y. N. Patt",
  title = "Checkpoint repair for out-of-order execution machines",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "18--26",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sohi:1987:IIL,
  author = "G. S. Sohi and S.
Vajapeyam",
  title = "Instruction issue logic for high-performance, interruptible pipelined processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "27--34",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Swensen:1987:FTS,
  author = "J. Swensen and Y. Patt",
  title = "Fast temporary storage for serial and parallel execution",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "35--43",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wong:1987:PAD,
  author = "K. Wong and M. A. Franklin",
  title = "Performance analysis and design of a logic simulation machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "46--55",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Doshi:1987:MSA,
  author = "K. Doshi and P.
Varman",
  title = "A modular systolic architecture for image convolutions",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "56--63",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Fujita:1987:TMA,
  author = "S. Fujita and R. Aibara and M. Yamashita and T. Ae",
  title = "A template matching algorithm using optically-connected {$3$-D} {VLSI} architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "64--70",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mendelson:1987:MDF,
  author = "B. Mendelson and G. M. Silberman",
  title = "Mapping data flow programs on a {VLSI} array of processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "72--80",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ghosal:1987:AMA,
  author = "D. Ghosal and L. N.
Bhuyan",
  title = "Analytical modeling and architectural modifications of a dataflow computer",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "81--89",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Takesue:1987:URM,
  author = "M. Takesue",
  title = "A unified resource management and execution control mechanism for data flow machines",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "90--97",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Abe:1987:HPI,
  author = "S. Abe and T. Bandoh and S. Yamaguchi and K. Kurosawa and K. Kiriyama",
  title = "High performance integrated {Prolog} processor {IPP}",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "100--107",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Fagin:1987:PSP,
  author = "B. S. Fagin and A. M.
Despain",
  title = "Performance studies of a parallel {Prolog} architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "108--116",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Civera:1987:EVP,
  author = "P. L. Civera and F. Maddaleno and G. L. Piccinini and M. Zamboni",
  title = "An experimental {VLSI} {Prolog} interpreter: preliminary measurements and results",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "117--126",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ridoux:1987:DSM,
  author = "O. Ridoux",
  title = "Deterministic and stochastic modeling of parallel garbage collection: towards real-time criteria",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "128--136",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sun:1987:SEP,
  author = "C. Sun and Y.
Tsu",
  title = "The sharing of environment in {AND--OR}-parallel execution of logic programs",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "137--144",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Guha:1987:AID,
  author = "A. Guha and R. Ramnarayan and M. Derstine",
  title = "Architectural issues in designing symbolic processors in optics",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "145--151",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Varma:1987:RMS,
  author = "A. Varma and C. S. Raghavendra",
  title = "Rearrangeability of multistage shuffle\slash exchange networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "154--162",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Beivide:1987:OMC,
  author = "R. Beivide and E. Herrada and J. L. Balcazar and J.
Labarta",
  title = "Optimized mesh-connected networks for {SIMD} and {MIMD} architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "163--170",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Harper:1987:PER,
  author = "D. T. {Harper III} and J. R. Jump",
  title = "Performance evaluation of reduced bandwidth multistage interconnection networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "171--175",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ramachandran:1987:HSI,
  author = "U. Ramachandran and M. Solomon and M. Vernon",
  title = "Hardware support for interprocess communication",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "178--188",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dally:1987:AMD,
  author = "W. J. Dally and L. Chao and A. Chien and S. Hassoun and W. Horwat and J. Kaplan and P. Song and B. Totty and S.
Wills",
  title = "Architecture of a message-driven processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "189--196",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:1987:ESA,
  author = "M. Kumar",
  title = "Effect of storage allocation\slash reclamation methods on parallelism and storage requirements",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "197--205",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:1987:CDS,
  author = "J. H. Chang and H. Chao and K. So",
  title = "Cache design of a sub-micron {CMOS} {System\slash 370}",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "208--213",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Freeman:1987:APM,
  author = "M.
Freeman",
  title = "An architectural perspective on a memory access controller",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "214--223",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cheung:1987:OAG,
  author = "K. Cheung and G. Sohi and K. Saluja and D. Pradhan",
  title = "Organization and analysis of a gracefully-degrading interleaved memory system",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "224--231",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Scheurich:1987:CMO,
  author = "C. Scheurich and M. Dubois",
  title = "Correct memory operation of cache-based multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "234--243",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wilson:1987:HCB,
  author = "A. W.
{Wilson, Jr.}",
  title = "Hierarchical cache\slash bus architecture for shared memory multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "244--252",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1987:MCD,
  author = "R. L. Lee and P. C. Yew and D. H. Lawrie",
  title = "Multiprocessor cache design considerations",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "253--262",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Eickemeyer:1987:PEM,
  author = "R. J. Eickemeyer and J. H. Patel",
  title = "Performance evaluation of multiple register sets",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "264--271",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Stanley:1987:PAA,
  author = "T. J. Stanley and R. G.
Wedig",
  title = "A performance analysis of automatically managed top of stack buffers",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "272--281",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Moore:1987:CSV,
  author = "B. Moore and A. Padegs and R. Smith and W. Buchholz",
  title = "Concepts of the {System\slash 370} vector architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "282--288",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Pleszkun:1987:WRA,
  author = "A. R. Pleszkun and J. R. Goodman and W. C. Hsu and R. T. Joersz and G. Bier and P. Woest and P. B. Schechter",
  title = "{WISQ}: a restartable architecture using queues",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "290--299",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chow:1987:ATD,
  author = "P. Chow and M.
Horowitz",
  title = "Architectural tradeoffs in the design of {MIPS-X}",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "300--308",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ditzel:1987:HAC,
  author = "D. R. Ditzel and H. R. McLellan and A. D. Berenbaum",
  title = "The hardware architecture of the {CRISP} microprocessor",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "2",
  pages = "309--319",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 16:49:40 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Moore:1987:BDN,
  author = "Matthew Moore and Charles McDowell",
  title = "Bi-directional networks for large parallel processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "3",
  pages = "3--4",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:53 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kaplan:1987:LLG,
  author = "Ian Kaplan",
  title = "The {LDF 100}: a large grain dataflow parallel processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "3",
  pages = "5--12",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:53 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lass:1987:WCC,
  author = "Stanley Lass",
  title = "Wide channel computers",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "3",
  pages = "13--16",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:53 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Bril:1987:IIA,
  author = "Reinder J. Bril",
  title = "An implementation independent approach to cache memories",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "3",
  pages = "17--24",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:53 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Bril:1987:CLV,
  author = "Reinder J. Bril",
  title = "On cacheability of lock-variables in tightly coupled multiprocessor systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "15",
  number = "3",
  pages = "25--32",
  month = jun,
  year = "1987",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:53 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Iliffe:1987:FLM,
  author = "J. K.
Iliffe", title = "A forward-looking method of {Cache} memory control", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "4--10", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bandyopadhyay:1987:CBM, author = "Amitava Bandyopadhyay and Yuan F. Zheng", title = "Combining both microcode and hardwired control in {RISC}", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "11--15", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dowd:1987:ERV, author = "Martin Dowd", title = "An example {RISC} vector machine architecture", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "16--22", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhatia:1987:MIN, author = "Sanjiv K. Bhatia and A. G. 
Starling", title = "Multilayered {Illiac} network scheme", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "23--31", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nowak:1987:SGP, author = "Lothar Nowak", title = "{SAMP}: a general purpose processor based on a self-timed {VLIW} structure", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "32--39", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ashenden:1987:LWP, author = "Peter J. Ashenden and Chris J. Barter and Chris D. Marlin", title = "The {Leopard} workstation project", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "40--51", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chiang:1987:DEL, author = "Y. P. Chiang and M. L.
Manwaring", title = "Direct execution {Lisp} and cell memory", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "52--57", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Terry:1987:FCM, author = "J. M. Terry", title = "Flow-control machines: the structured execution architecture {(SXA)}", journal = j-COMP-ARCH-NEWS, volume = "15", number = "4", pages = "58--69", month = sep, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:11 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wirth:1987:HAP, author = "Niklaus Wirth", title = "Hardware architectures for programming languages and programming languages for hardware architectures", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "2--8", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Beck:1987:VAM, author = "Bob Beck and Bob Kasten and Shreekant Thakkar", title = "{VLSI} assist for a multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "10--20", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource =
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bisiani:1987:ASM, author = "Roberto Bisiani and Alessandro Forin", title = "Architectural support for multilanguage parallel programming on heterogeneous systems", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "21--30", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rashid:1987:MIV, author = "Richard Rashid and Avadis Tevanian and Michael Young and David Golub and Robert Baron", title = "Machine-independent virtual memory management for paged uniprocessor and multiprocessor architectures", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "31--39", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hayes:1987:ADE, author = "John R. Hayes and Martin E. Fraeman and Robert L. 
Williams and Thomas Zaremba", title = "An architecture for the direct execution of the {Forth} programming language", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "42--49", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Steenkiste:1987:TTC, author = "Peter Steenkiste and John Hennessy", title = "Tags and type checking in {LISP}: hardware and software approaches", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "50--59", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Davidson:1987:EIS, author = "Jack W. Davidson and Richard A. Vaughan", title = "The effect of instruction set complexity on program size and memory performance", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "60--64", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Atkinson:1987:DP, author = "Russell R. Atkinson and Edward M. 
McCreight", title = "The {Dragon} processor", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "65--69", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goodman:1987:CMV, author = "James R. Goodman", title = "Coherency for multiprocessor virtual address caches", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "72--81", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cargill:1987:CHS, author = "T. A. Cargill and B. N. Locanthi", title = "Cheap hardware support for software debugging and profiling", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "82--83", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Georgiou:1987:ECI, author = "C. J. Georgiou and S. L. Palmer and P. L.
Rosenfeld", title = "An experimental coprocessor for implementing persistent objects on an {IBM 4381}", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "84--87", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Magenheimer:1987:IMD, author = "Daniel J. Magenheimer and Liz Peters and Karl Pettis and Dan Zuras", title = "Integer multiplication and division on the {HP Precision Architecture}", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "90--99", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wall:1987:MEU, author = "David W. Wall and Michael L. Powell", title = "The {Mahler} experience: using an intermediate language as the machine description", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "100--104", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Weiss:1987:SSC, author = "Shlomo Weiss and James E.
Smith", title = "A study of scalar compilation techniques for pipelined supercomputers", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "105--109", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bush:1987:CSR, author = "William R. Bush and A. Dain Samples and David Ungar and Paul N. Hilfinger", title = "Compiling {Smalltalk-80} to a {RISC}", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "112--116", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chow:1987:HMA, author = "F. Chow and S. Correll and M. Himelstein and E. Killian and L. 
Weber", title = "How many addressing modes are enough?", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "117--121", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Massalin:1987:SLS, author = "Henry Massalin", title = "{Superoptimizer}: a look at the smallest program", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "122--126", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Taki:1987:PAE, author = "Kazuo Taki and Katzuto Nakajima and Hiroshi Nakashima and Morihiro Ikeda", title = "Performance and architectural evaluation of the {PSI} machine", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "128--135", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Borriello:1987:RVC, author = "Gaetano Borriello and Andrew R. Cherenson and Peter B. Danzig and Michael N. Nelson", title = "{RISCs} vs. 
{CISCs} for {Prolog}: a case study", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "136--145", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kieburtz:1987:RAS, author = "Richard B. Kieburtz", title = "A {RISC} architecture for symbolic computation", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "146--155", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ditzel:1987:DTS, author = "David R. Ditzel and Hubert R. McLellan and Alan D. Berenbaum", title = "Design tradeoffs to support the {C} programming language in the {CRISP} microprocessor", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "158--163", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thacker:1987:FMW, author = "Charles P. Thacker and Lawrence C. 
Stewart", title = "{Firefly}: a multiprocessor workstation", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "164--172", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Clark:1987:PPV, author = "Douglas W. Clark", title = "Pipelining and performance in the {VAX 8800} processor", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "173--177", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Colwell:1987:VAT, author = "Robert P. Colwell and Robert P. Nix and John J. O'Donnell and David B. Papworth and Paul K. 
Rodman", title = "A {VLIW} architecture for a trace scheduling compiler", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "180--192", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Levinthal:1987:PCG, author = "Adam Levinthal and Pat Hanrahan and Mike Paquette and Jim Lawson", title = "Parallel computers for graphics applications", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "193--198", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1987:ZCP, author = "J. E. Smith and G. E. Dermer and B. D. Vanderwarn and S. D. Klinger and C. M. Rozewski", title = "The {ZS-1} central processor", journal = j-COMP-ARCH-NEWS, volume = "15", number = "5", pages = "199--204", month = oct, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:25 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Frietman:1987:EOD, author = "E. E. E. Frietman and A. B. 
Ruighaver", title = "An electro-optic data communication system for the {Delft} parallel processor", journal = j-COMP-ARCH-NEWS, volume = "15", number = "6", pages = "2--8", month = dec, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:28 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shippen:1987:TTD, author = "G. B. Shippen and J. K. Archibald", title = "A tagged token dataflow machine for computing small, iterative algorithms", journal = j-COMP-ARCH-NEWS, volume = "15", number = "6", pages = "9--18", month = dec, year = "1987", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:28 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Penn:1988:PSI, author = "Clif Penn", title = "Preface to the {Special} issue on {Neural Networks}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "6--6", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lippmann:1988:ICN, author = "Richard P. 
Lippmann", title = "An introduction to computing with neural nets", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "7--25", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anderson:1988:SNN, author = "James A. Anderson and Edward J. Wisniewski and Susan R. Viscuso", title = "Software for neural networks", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "26--36", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Garth:1988:ISN, author = "Simon Garth and Danny Pike", title = "An integrated system for neural network simulations", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "37--44", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Maren:1988:CRI, author = "A. 
Jean Maren", title = "Conference report: {IEEE First International Conference on Neural Networks}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "45--46", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dongarra:1988:PVC, author = "Jack J. Dongarra", title = "Performance of various computers using standard linear equations software in a {FORTRAN} environment", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "47--69", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wulf:1988:WCA, author = "Wm. A. 
Wulf", title = "The {WM} computer architecture", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "70--84", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tabak:1988:LIM, author = "Daniel Tabak", title = "Logarithmic indices for multiprocessor evaluation", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "85--90", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dowd:1988:ERV, author = "Martin Dowd", title = "An example {RISC} vector machine architecture", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "91--99", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dowd:1988:RVC, author = "Martin Dowd", title = "{RISC} vector {CPU}'s and crossbars in desktops", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "100--102", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH 
Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lass:1988:MIO, author = "Stanley Lass", title = "Multiple instructions\slash operands per access to cache memory", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "103--103", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gass:1988:WRS, author = "Wanda Gass", title = "Workshop report: synthesis of foo bars", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "104--108", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ferguson:1988:BRL, author = "F. Joel Ferguson", title = "Book Review: {{\em Logic Design Principles\/}} by {Edward J. McCluskey, Prentice-Hall Publishers, Englewood Cliffs, New Jersey, 549 pp., \$39.95}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "1", pages = "109--109", month = mar, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:31 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ghosh:1988:CIM, author = "J. Ghosh and K. 
Hwang", title = "Critical issues in mapping neural networks on message-passing multicomputers", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "3--11", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Takefuji:1988:MCS, author = "Y. Takefuji and R. Jannarone and Y. B. Cho and T. Chen", title = "Multinomial conjunctoid statistical learning machines", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "12--17", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Louri:1988:BPA, author = "A. Louri and K. Hwang", title = "A bit-plane architecture for optical computing with two-dimensional symbolic substitution", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "18--27", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fiske:1988:RAP, author = "S. Fiske and W. J. 
Dally", title = "The reconfigurable arithmetic processor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "30--36", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pleszkun:1988:PPM, author = "A. R. Pleszkun and G. S. Sohi", title = "The performance potential of multiple functional unit processors", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "37--44", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hwu:1988:EPM, author = "W. W. Hwu and P. P. Chang", title = "Exploiting parallel microprocessor microarchitectures with a compiler code generator", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "45--53", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McNiven:1988:AMR, author = "G. D. McNiven and E. S. 
Davidson", title = "Analysis of memory referencing behavior for design of local memories", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "56--63", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eickenmeyer:1988:PEC, author = "R. J. Eickemeyer and J. H. Patel", title = "Performance evaluation of on-chip register and cache organizations", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "64--72", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Baer:1988:IPM, author = "J.-L. Baer and W.-H. Wang", title = "On the inclusion properties for multi-level cache hierarchies", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "73--80", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Short:1988:SST, author = "R. T. Short and H. M. 
Levy", title = "A simulation study of two-level caches", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "81--88", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chow:1988:HNH, author = "E. Chow and H. Madan and J. Peterson and D. Grunwald and D. Reed", title = "Hyperswitch network for the hypercube computer", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "90--99", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Winsor:1988:ABH, author = "D. C. Winsor and T. N. Mudge", title = "Analysis of bus hierarchies for multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "100--107", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wei:1988:EGN, author = "S. Wei and G. 
Lee", title = "Extra group network: a cost-effective fault-tolerant multistage interconnection network", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "108--115", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jiang:1988:PMB, author = "H. Jiang and K. C. Smith", title = "A partial-multiple-bus computer structure with improved cost effectiveness", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "116--122", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Watson:1988:FPA, author = "I. Watson and V. Woods and P. Watson and R. Banach and M. Greenberg and J. Sargeant", title = "{Flagship}: a parallel architecture for declarative programming", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "124--130", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Iannucci:1988:TDN, author = "R. A. 
Iannucci", title = "Toward a dataflow\slash {von Neumann} hybrid architecture", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "131--140", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Culler:1988:RRD, author = "D. E. Culler and Arvind", title = "Resource requirements of dataflow programs", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "141--150", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sprunt:1988:PDP, author = "B. Sprunt and D. Kirk and L. Sha", title = "Priority-driven, preemptive {I/O} controllers for real-time systems", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "152--159", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shukla:1988:KIP, author = "S. B. Shukla and D. P. 
Agrawal", title = "A kernel-independent, pipelined architecture for real-time {$2$-D} convolution", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "160--166", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Liu:1988:EBL, author = "W. Liu and T.-F. Yeh and W. E. Batchelor and R. Cavin", title = "Exploiting bit level concurrency in real-time geometric feature extractions", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "167--174", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Clark:1988:MVP, author = "D. W. Clark and P. J. Bannon and J. B. Keller", title = "Measuring {VAX 8800} performance with a histogram hardware monitor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "176--185", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sites:1988:MCA, author = "R. L. Sites and A. 
Agarwal", title = "Multiprocessor cache analysis using {ATUM}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "186--195", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ng:1988:TOB, author = "S. Ng and D. Lang and R. Selinger", title = "Trade-offs between devices and paths in achieving disk interleaving", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "196--201", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jainandunsing:1988:DCC, author = "K. Jainandunsing and E. F. Deprettere", title = "Design of a concurrent computer for solving systems of linear equations", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "204--211", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wolfe:1988:WDH, author = "A. Wolfe and M. {Breternitz, Jr.} and C. Stephens and A. L. Ting and D. B. Kirk and R. P. {Bianchini, Jr.} and J. P. 
Shen", title = "The white dwarf: a high-performance application-specific processor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "212--222", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gaudiot:1988:SPD, author = "J. L. Gaudiot and C. M. Lin and M. Hosseiniyar", title = "Solving partial differential equations in a data-driven multiprocessor environment", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "223--230", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1988:SSP, author = "D. Lee", title = "Scrambled storage for parallel memory systems", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "232--239", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Krishnaswamy:1988:ALC, author = "V. Krishnaswamy and S. Ahuja and N. Carriero and D. 
Gelernter", title = "The architecture of a {Linda} coprocessor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "240--249", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kung:1988:DAS, author = "H. T. Kung", title = "Deadlock avoidance for systolic communication", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "252--260", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{So:1988:CPV, author = "K. So and V. Zecca", title = "Cache performance of vector processors", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "261--268", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vernon:1988:DRR, author = "M. K. Vernon and U. 
Manber", title = "Distributed round-robin and first-come first-serve protocols and their applications to multiprocessor bus arbitration", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "269--279", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agarwal:1988:EDS, author = "A. Agarwal and R. Simoni and J. Hennessy and M. Horowitz", title = "An evaluation of directory schemes for cache coherence", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "280--289", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Prybylski:1988:PTC, author = "S. Przybylski and M. Horowitz and J. Hennessy", title = "Performance tradeoffs in cache design", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "290--298", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheong:1988:CCS, author = "H. Cheong and A. V. 
Veidenbaum", title = "A cache coherence scheme with fast selective invalidation", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "299--307", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vernon:1988:AEP, author = "M. K. Vernon and E. D. Lazowska and J. Zahorjan", title = "An accurate and efficient performance analysis technique for multiprocessor snooping cache-consistency protocols", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "308--315", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rau:1988:DTR, author = "D. Rau and J. A. B. Fortes and H. J. Siegel", title = "Destination tag routing techniques based on a state model for the {LADM} network", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "318--324", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:1988:RCB, author = "D. W. Kim and G. J. Lipovski and A. Hartmann and R. 
Jenevein", title = "Regular {CC}-banyan networks", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "325--332", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jenevein:1988:TAR, author = "R. M. Jenevein and T. Mookken", title = "Traffic analysis of rectangular {SW}-banyan networks", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "333--342", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tamir:1988:HPM, author = "Y. Tamir and G. L. Frazier", title = "High-performance multi-queue buffers for {VLSI} communications switches", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "343--354", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Preiss:1988:CBM, author = "B. R. Preiss and V. C. 
Hamacher", title = "A cache-based message passing scheme for a shared-bus multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "358--364", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Boku:1988:IHP, author = "T. Boku and S. Nomura and H. Amano", title = "{IMPULSE}: a high performance processing unit for multiprocessors for scientific calculation", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "365--372", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eggers:1988:CSP, author = "S. J. Eggers and R. H. Katz", title = "A characterization of sharing in parallel programs and its application to coherency protocol evaluation", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "373--382", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lipovski:1988:FOI, author = "G. J. Lipovski and P. 
Vaughan", title = "A fetch-and-op implementation for parallel computers", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "384--392", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Seznec:1988:SPT, author = "A. Seznec and Y. J{\'e}gou", title = "Synchronizing processors through memory requests in a tightly coupled multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "393--400", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fujimoto:1988:DPS, author = "R. M. Fujimoto and J.-J. Tsai and G. Gopalakrishnan", title = "Design and performance of special purpose hardware for time warp", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "401--409", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheriton:1988:VMI, author = "D. R. Cheriton and A. Gupta and P. D. Boyle and H. A. 
Goosen", title = "The {VMP} multiprocessor: initial experience, refinements, and performance evaluation", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "410--421", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goodman:1988:WMN, author = "J. R. Goodman and P. J. Woest", title = "The {Wisconsin} multicube: a new large-scale cache-coherent multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "422--431", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tick:1988:DBP, author = "E. Tick", title = "Data buffer performance for sequential {Prolog} architectures", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "434--442", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Halstead:1988:MMP, author = "R. H. {Halstead, Jr.} and T. 
Fujita", title = "{MASA}: a multithreaded processor architecture for parallel symbolic computing", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "443--451", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Butler:1988:PAO, author = "P. L. Butler and J. D. {Allen, Jr.} and D. W. Bouldin", title = "Parallel architecture for {OPS5}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "452--457", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheriton:1988:CCM, author = "David R. Cheriton and Pat Boyle and Gert A. Slavenburg", title = "Comments on {``Coherency for multiprocessor virtual addresses caches''} by {James R. Goodman}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "3--6", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goodman:1988:RDR, author = "James R. Goodman", title = "Reply to {David R. Cheriton's, Pat Boyle's, and Gert A. Slavenburg's ``Comments on `Coherency for multiprocessor virtual addressed caches'\,'' by James R. 
Goodman}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "7--7", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rabbat:1988:TDC, author = "Guy Rabbat and Borko Furht and Ron Kibler", title = "Three-dimensional computers and measuring their performance", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "9--16", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Castan:1988:MPG, author = "M. Castan and A. Contessa and E. Cousin and C. Coustet and B. 
Lecussan", title = "{MaRS}: a parallel graph reduction multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "17--24", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Contessa:1988:AFT, author = "Alessandro Contessa", title = "An approach to fault tolerance and error recovery in a parallel graph reduction machine: {MaRS}---a case study", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "25--32", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Crawford:1988:EHH, author = "Chuck Crawford", title = "Evolution of the {Harris H-series} computers and speculations on their future", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "33--39", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Good:1988:SIC, author = "Philip L. 
Good", title = "Structuring an instruction cache", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "40--43", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Johnson:1988:CMM, author = "Eric E. Johnson", title = "Completing an {MIMD} multiprocessor taxonomy", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "44--47", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jones:1988:UR, author = "Douglas W. Jones", title = "The ultimate {RISC}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "48--55", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jones:1988:MC, author = "Douglas W. 
Jones", title = "A minimal {CISC}", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "56--63", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lass:1988:SCM, author = "Stanley Lass", title = "Shared cache multiprocessing with pack computers", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "64--70", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jouppi:1988:SVS, author = "Norman P. Jouppi", title = "Superscalar vs. superpipelined machines", journal = j-COMP-ARCH-NEWS, volume = "16", number = "3", pages = "71--80", month = jun, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schachter:1988:BRH, author = "Lorne H. Schachter", title = "Book review of {{\em High-Performance Computer Architecture\/}} by {Harold S. Stone. 
Addison-Wesley 1987}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "3",
  pages = "81--84",
  month = jun,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:55 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 16, number 4 (September 1988)

@Article{Ramachandran:1988:PSI,
  author = "Umakishore Ramachandran",
  title = "Preface to the {Special Issue on Architectural Support for Operating Systems}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "11--11",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Asthana:1988:IMS,
  author = "A. Asthana and H. V. Jagadish and J. A. Chandross and D. Lin and S. C. Knauer",
  title = "An intelligent memory system",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "12--20",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Beltrametti:1988:CMM,
  author = "Monica Beltrametti and Kenneth Bobey and John R.
Zorbas",
  title = "The control mechanism for the {Myrias} parallel computer system",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "21--30",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Finkel:1988:YSM: the second author's surname was misspelled
%%% "Hengsen" (transposed letters); corrected to "Hensgen".

@Article{Finkel:1988:YSM,
  author = "Raphael Finkel and Debra Hensgen",
  title = "{YACKOS} on a shared-memory multiprocessor",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "31--36",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Pucci:1988:OCE,
  author = "Marc F. Pucci and J. L.
Alberi",
  title = "Optimized communication in an extended remote procedure call model",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "37--46",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): the page range 37--46 of the entry ending above
%%% overlaps the range 45--50 of Cortadella:1988:DRC below; one of the
%%% two is presumably wrong -- confirm against the printed issue.

@Article{Cortadella:1988:DRC,
  author = "Jordi Cortadella and Teodor Jov{\'e}",
  title = "Dynamic {RAM} for on-chip instruction caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "45--50",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Naderi:1988:MPEa,
  author = "M. Naderi",
  title = "Modelling and performance evaluation of multiprocessors organization with shared memories",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "51--74",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gehringer:1988:SCP,
  author = "Edward Gehringer and Janne Abullarade and Michael H.
Gulyn",
  title = "A survey of commercial parallel processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "75--107",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lease:1988:CPS,
  author = "Mark Lease and Mac Lively",
  title = "Comparing production system architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "108--116",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Page:1988:FAH,
  author = "Ivor Page and Jeff Niehaus",
  title = "The {Flex} architecture, a high speed graphics processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "117--129",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Murakami:1988:OKU,
  author = "Kazuaki Murakami and Akira Fukuda and Toshinori Sueyoshi and Shinji Tomita",
  title = "An overview of the {Kyushu University} reconfigurable parallel processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "130--137",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12
09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Percus:1988:SRC,
  author = "Ora E. Percus and J. K. Percus",
  title = "Some results concerning clock-regulated queues",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "138--144",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Williams:1988:SSS,
  author = "Fleur Liane Williams",
  title = "Should {SCC} set condition codes?",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "145--149",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Steven:1988:NEA,
  author = "Gordon B.
Steven",
  title = "A novel effective address calculation mechanism for {RISC} microprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "150--156",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Parhami:1988:DFV,
  author = "Behrooz Parhami",
  title = "From defects to failures: a view of dependable computing",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "157--168",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Patterson:1988:RP,
  author = "David A. Patterson",
  title = "{RISCY} patents",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "169--191",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Takacs:1988:BRV,
  author = "Helen C. Takacs",
  title = "Book review: {{\em A VLSI Architecture for Concurrent Data Structures\/}} by {William J.
Dally (Kluwer 1988)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "192--193",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Colwell:1988:BRC,
  author = "Robert P. Colwell",
  title = "Book review: {{\em Computer Architecture and Organization}}, 2nd ed. by {John P. Hayes (McGraw Hill, 1988)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "193--195",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{McDowell:1988:BRS,
  author = "Charles E. McDowell",
  title = "Book review: {{\em Supercomputer Architectures\/}} by {Paul B. Schneck (Kluwer Academic Publishers)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "4",
  pages = "195--196",
  month = sep,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:11 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 16, number 5 (December 1988)

@Article{Hum:1988:SWF,
  author = "Herbert H. J. Hum and Guang R.
Gao",
  title = "Summary of the workshop on frontiers in functional programming and dataflow architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "5",
  pages = "12--19",
  month = dec,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:19 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{vanTilborg:1988:IDC,
  author = "Andre M. van Tilborg",
  title = "Instrumentation for distributed computing systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "5",
  pages = "20--25",
  month = dec,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:19 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Griffin:1988:UUR,
  author = "Glenn W. Griffin",
  title = "The ultimate ultimate {RISC}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "5",
  pages = "26--32",
  month = dec,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:19 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Jones:1988:RCR,
  author = "Douglas W.
Jones",
  title = "Risks of comparing {RISCs}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "5",
  pages = "33--34",
  month = dec,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:19 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Naderi:1988:MPEb,
  author = "M. Naderi",
  title = "Modelling and performance evaluation of multiprocessors, organizations with multi-memory units",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "5",
  pages = "35--51",
  month = dec,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:19 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kogge:1988:VRB,
  author = "Peter Kogge and John Oldfield and Mark Brule and Charles Stormon",
  title = "{VLSI} and rule-based systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "5",
  pages = "52--65",
  month = dec,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:19 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Parhami:1988:BRM,
  author = "Behrooz Parhami",
  title = "Book review: {{\em Memory Storage Patterns in Parallel Processing\/}} by {Mary A.
Mace (Kluwer Academic Publishers, Boston, 1987, 139 pp.)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "16",
  number = "5",
  pages = "76--76",
  month = dec,
  year = "1988",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:19 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 17, number 1 (March 1989)

@Article{Moskowitz:1989:AMM,
  author = "J. P. Moskowitz and C. Jousselin",
  title = "An algebraic memory model",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "1",
  pages = "55--62",
  month = mar,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wong:1989:SAS,
  author = "W. F.
Wong",
  title = "A stack addressing scheme based on windowing",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "1",
  pages = "63--69",
  month = mar,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:1989:PTD,
  author = "Anonymous",
  title = "Pipelining through {Dynamic Control ROM}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "1",
  pages = "70--72",
  month = mar,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lass:1989:SIC,
  author = "Stanley E. Lass",
  title = "Some innovations in computer architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "1",
  pages = "73--77",
  month = mar,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Bitar:1989:BRR,
  author = "Philip Bitar",
  title = "Book reviews: Review of {{\em Parallel Execution of Logic Programs\/}} by {John Conery.
Kluwer Academic Publishers 1987}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "1",
  pages = "81--82",
  month = mar,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 17, number 2 (April 1989)

@Article{Cohn:1989:ACT,
  author = "Robert Cohn and Thomas Gross and Monica Lam",
  title = "Architecture and compiler tradeoffs for a long instruction word processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "2--14",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sohi:1989:TIF,
  author = "Gurindar S. Sohi and Sriram Vajapeyam",
  title = "Tradeoffs in instruction format design for horizontal architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "15--25",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dehnert:1989:OLS,
  author = "James C. Dehnert and Peter Y.-T. Hsu and Joseph P.
Bratt",
  title = "Overlapped loop support in the {Cydra 5}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "26--38",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Burkowski:1989:ASS,
  author = "F. J. Burkowski and G. V. Cormack and G. D. P. Dueck",
  title = "Architectural support for synchronous task communication",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "40--53",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gupta:1989:FBM,
  author = "Rajiv Gupta",
  title = "The fuzzy barrier: a mechanism for high speed synchronization of processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "54--63",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Goodman:1989:ESP,
  author = "James R. Goodman and Mary K. Vernon and Philip J.
Woest",
  title = "Efficient synchronization primitives for large-scale cache-coherent multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "64--75",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mellor-Crummey:1989:SIC,
  author = "J. M. Mellor-Crummey and T. J. LeBlanc",
  title = "A software instruction counter",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "78--86",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Aral:1989:EDP: the second author's surname was misspelled
%%% "Gerther"; corrected to "Gertner".

@Article{Aral:1989:EDP,
  author = "Z. Aral and I. Gertner and G. Schaffer",
  title = "Efficient debugging primitives for multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "87--95",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Staknis:1989:SMA,
  author = "M. E.
Staknis",
  title = "Sheaved memory: architectural support for state saving and restoration in paged systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "96--102",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Staknis:1989:SMA (the entry ending above): title corrected from
%%% "pages systems" to "paged systems".

@Article{Holliday:1989:RHP,
  author = "M. A. Holliday",
  title = "Reference history, page size, and migration daemons in local\slash remote architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "104--112",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Black:1989:TLB,
  author = "D. L. Black and R. F. Rashid and D. B. Golub and C. R. Hill",
  title = "Translation lookaside buffer consistency: a software approach",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "113--122",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gibson:1989:FCT,
  author = "G. A. Gibson and L. Hellerstein and R. M. Karp and D. A.
Patterson",
  title = "Failure correction techniques for large disk arrays",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "123--132",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1989:UVS,
  author = "N. P. Jouppi and J. Bertoni and D. W. Wall",
  title = "A unified vector\slash scalar floating-point architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "134--143",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mulder:1989:DBR,
  author = "H. Mulder",
  title = "Data buffering: run-time versus compile-time support",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "144--151",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Adams:1989:AIS,
  author = "T. L. Adams and R. E.
Zimmerman",
  title = "An analysis of 8086 instruction set usage in {MS DOS} programs",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "152--160",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Roos:1989:RTS,
  author = "J. Roos",
  title = "A real-time support processor for {Ada} tasking",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "162--171",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Vegdahl:1989:RES: the title previously read "{Scheme}, a {Scheme}
%%% implementation"; the system described is named "Screme".
%%% NOTE(review): confirm the spelling against the published paper.

@Article{Vegdahl:1989:RES,
  author = "Steven R. Vegdahl and Uwe F. Pleban",
  title = "The runtime environment for {Screme}, a {Scheme} implementation on the 88000",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "172--182",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{McFarling:1989:POI,
  author = "S.
McFarling",
  title = "Program optimization for instruction caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "183--191",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Karger:1989:URO,
  author = "Paul A. Karger",
  title = "Using registers to optimize cross-domain call performance",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "194--204",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Arnould:1989:DNN: the last author's surname carried a stray
%%% trailing letter ("Cooperm"); corrected to "Cooper".  The system
%%% name "Nectar" is now capitalized and brace-protected so that
%%% styles which downcase titles preserve it.

@Article{Arnould:1989:DNN,
  author = "Emmanuel Arnould and H. T. Kung and Fran{\c{c}}ois Bitz and Robert D. Sansom and Eric C. Cooper",
  title = "The design of {Nectar}: a network backplane for heterogeneous multicomputers",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "205--216",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Delgado-Rannauro:1989:MDP,
  author = "S. A. Delgado-Rannauro and T. J.
Reynolds",
  title = "A message driven {OR}-parallel machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "217--228",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Owicki:1989:EPS,
  author = "S. Owicki and A. Agarwal",
  title = "Evaluating the performance of software cache coherence",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "230--242",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Weber:1989:ACI,
  author = "W. Weber and A. Gupta",
  title = "Analysis of cache invalidation patterns in multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "243--256",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Eggers:1989:ESC,
  author = "S. J. Eggers and R. H.
Katz",
  title = "The effect of sharing on the cache and bus performance of parallel programs",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "257--270",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1989:AIL,
  author = "N. P. Jouppi and D. W. Wall",
  title = "Available instruction-level parallelism for superscalar and superpipelined machines",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "272--282",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dally:1989:MOF,
  author = "W. J. Dally",
  title = "Micro-optimization of floating-point operations",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "283--289",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1989:LMI,
  author = "M. D. Smith and M. Johnson and M. A.
Horowitz",
  title = "Limits on multiple instruction issue",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "2",
  pages = "290--302",
  month = apr,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:39 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Eggers:1989:EPF,
  author = "S. J. Eggers and R. H. Katz",
  title = "Evaluating the performance of four snooping cache coherency protocols",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "2--15",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cheriton:1989:MLS,
  author = "D. R. Cheriton and H. A. Goosen and P. D. Boyle",
  title = "Multi-level shared caching techniques for scalability in {VMP-M/C}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "16--24",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Goto:1989:DPC,
  author = "A. Goto and A. Matsumoto and E.
Tick",
  title = "Design and performance of a coherent cache for parallel logic programming architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "25--33",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Grafe:1989:EDP,
  author = "V. G. Grafe and G. S. Davidson and J. E. Hoch and V. P. Holmes",
  title = "The {Epsilon} dataflow processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "36--45",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Author initials are capitalized: in "First von Last" form BibTeX treats
%%% a leading lowercase token (such as "y.") as a von-particle, not as a
%%% given-name initial.
@Article{Sakai:1989:ADS,
  author = "S. Sakai and Y. Yamaguchi and K. Hiraki and Y. Kodama and T. Yuba",
  title = "An architecture of a dataflow single chip processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "46--53",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Nitezki:1989:EDP,
  author = "P.
Nitezki",
  title = "Exploiting data parallelism in signal processing on a dataflow machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "54--61",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ibbett:1989:AMS,
  author = "R. N. Ibbett and T. M. Hopkins and K. I. M. McKinnon",
  title = "Architectural mechanisms to support sparse vector processing",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "64--71",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Harper:1989:DSS,
  author = "D. T. Harper and D. A. Linebarger",
  title = "A dynamic storage scheme for conflict-free vector access",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "72--77",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Murakami:1989:SSI,
  author = "K. Murakami and N. Irie and S.
Tomita",
  title = "{SIMP} (Single Instruction stream\slash Multiple instruction Pipelining): a novel high-speed single-processor architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "78--85",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ben-Asher:1989:DSA,
  author = "Y. Ben-Asher and D. Egozi and A. Schuster",
  title = "{$2$-D SIMD} algorithms in the perfect shuffle networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "88--95",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Valero-Garcia:1989:SHA,
  author = "M. Valero-Garcia and J. J. Navarro and J. M. Llaberia and M. Valero",
  title = "Systematic hardware adaptation of systolic algorithms",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "96--104",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:1989:TMH,
  author = "M.-S. Chen and K. G.
Shin",
  title = "Task migration in hypercube multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "105--111",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Przybylski:1989:CPO,
  author = "S. Przybylski and M. Horowitz and J. Hennessy",
  title = "Characteristics of performance-optimal multi-level cache hierarchies",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "114--121",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wood:1989:SRD,
  author = "D. A. Wood and R. H. Katz",
  title = "Supporting reference and dirty bits in {SPUR}'s virtual address cache",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "122--130",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kessler:1989:IIS,
  author = "R. E. Kessler and R. Jooss and A. Lebeck and M. D.
Hill",
  title = "Inexpensive implementations of set-associativity",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "131--139",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:1989:OPT,
  author = "W. H. Wang and J.-L. Baer and H. M. Levy",
  title = "Organization and performance of a two-level virtual-real cache hierarchy",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "140--148",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Jesshope:1989:HPC,
  author = "C. R. Jesshope and P. R. Miller and J. T. Yantchev",
  title = "High performance communications in processor networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "150--157",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% The second author is written "J.-L. Baer", the same hyphenated form
%%% used in Wang:1989:OPT, so the name is spelled one way throughout.
@Article{Mizrahi:1989:IMS,
  author = "H. E. Mizrahi and J.-L. Baer and E. D. Lazowska and J.
Zahorjan",
  title = "Introducing memory into the switch elements of multiprocessor interconnection networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "158--166",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Scott:1989:UFC,
  author = "S. L. Scott and G. S. Sohi",
  title = "Using feedback to control tree saturation in multistage interconnection networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "167--176",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ezhilchelvan:1989:CRS,
  author = "P. D. Ezhilchelvan and S. K. Shrivastava and A. Tully",
  title = "Constructing replicated systems using processors with point-to-point communication links",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "177--184",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Benker:1989:KKC,
  author = "H. Benker and J. M. Beacco and M. Dorochevsky and Th. Jeffr{\'e} and A. P{\"o}hlmann and J. Noy{\'e} and B. Poterie and J. C. Syre and O. Thibault and G.
Watzlawik",
  title = "{KCM}: a knowledge crunching machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "186--194",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Singhal:1989:HPP,
  author = "A. Singhal and Y. N. Patt",
  title = "A high performance {Prolog} processor with multiple function units",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "195--202",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Morioka:1989:EMS,
  author = "M. Morioka and S. Yamaguchi and T. Bandoh",
  title = "Evaluation of memory system for integrated {Prolog} processor {IPP}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "203--210",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wong:1989:TDH,
  author = "K.-F. Wong and M. H.
Williams",
  title = "A type driven hardware engine for {Prolog} clause retrieval over a large knowledge base",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "211--222",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hwu:1989:CSH,
  author = "W. W. Hwu and T. M. Conte and P. P. Chang",
  title = "Comparing software and hardware schemes for reducing the cost of branches",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "224--233",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Author initials are capitalized: in "First von Last" form BibTeX treats
%%% a leading lowercase token (such as "a.") as a von-particle, not as a
%%% given-name initial.
@Article{Farrens:1989:IPS,
  author = "M. K. Farrens and A. R. Pleszkun",
  title = "Improving performance of small on-chip instruction caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "234--241",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hwu:1989:AHI,
  author = "W. W. Hwu and P. P.
Chang",
  title = "Achieving high instruction cache performance with an optimizing compiler",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "242--251",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Steenkiste:1989:ICD,
  author = "P. Steenkiste",
  title = "The impact of code density on instruction cache performance",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "252--259",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Nikhil:1989:CDS,
  author = "R. S. Nikhil",
  title = "Can dataflow subsume {von Neumann} computing?",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "262--272",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Weber:1989:EBM,
  author = "W.-D. Weber and A.
Gupta",
  title = "Exploring the benefits of multiple hardware contexts in a multiprocessor architecture: preliminary results",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "273--280",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1989:AOT,
  author = "N. P. Jouppi",
  title = "Architectural and organizational tradeoffs in the design of the {MultiTitan CPU}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "281--289",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sato:1989:RTC,
  author = "M. Sato and S. Ichikawa and E. Goto",
  title = "Run-time checking in {Lisp} by integrating memory addressing and range checking",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "290--297",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hopper:1989:MVW,
  author = "A. Hopper and A. Jones and D. Lioupis",
  title = "Multiple vs.
wide shared bus multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "300--306",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Annaratone:1989:PMC,
  author = "M. Annaratone and R. R{\"u}hl",
  title = "Performance measurements on a commercial multiprocessor running parallel code",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "307--314",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Annaratone:1989:ICS,
  author = "M. Annaratone and C. Pommerell and R. R{\"u}hl",
  title = "Interprocessor communication speed and performance in distributed-memory parallel processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "315--324",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ghosal:1989:ACC,
  author = "D. S. Ghosal and S. K. Tripathi and L. N. Bhuyan and H.
Jiang",
  title = "Analysis of computation-communication issues in dynamic dataflow architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "325--333",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kravitz:1989:LSM,
  author = "S. Kravitz and R. E. Bryant and R. Rutenbar",
  title = "Logic simulation on massively parallel architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "336--343",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Fukazawa:1989:RRP,
  author = "T. Fukazawa and T. Kimura and M. Tomizawa and K. Takeda and Y. Itoh",
  title = "{R256}: a research parallel processor for scientific computation",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "344--351",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Anido:1989:TPT,
  author = "M. L. Anido and D. J. Allerton and E. J.
Zaluska",
  title = "A three-port\slash three-access register file for concurrent processing and {I/O} communication in a {RISC}-like graphics engine",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "354--361",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mulder:1989:AFA,
  author = "J. M. Mulder and R. J. Portier and A. Srivastava and R. in't Velt",
  title = "An architecture framework for application-specific and scalable architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "362--369",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:1989:PLS,
  author = "K. Kim and V. K. Prasanna-Kumar",
  title = "Perfect {Latin} squares and parallel array access",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "372--379",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Weiss:1989:ASS,
  author = "S.
Weiss",
  title = "An aperiodic storage scheme to reduce memory conflicts in vector processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "380--386",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:1989:AVA,
  author = "C.-L. Chen and C.-K. Liao",
  title = "Analysis of vector access performance on skewed interleaved memory",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "387--394",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Agarwal:1989:ABS,
  author = "A. Agarwal and M. Cherian",
  title = "Adaptive backoff synchronization techniques",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "396--406",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Stenstrom:1989:CCP,
  author = "P.
Stenstr{\"o}m",
  title = "A cache consistency protocol for multiprocessors with multistage networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "407--415",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Su:1989:DSM,
  author = "H.-M. Su and P.-C. Yew",
  title = "On data synchronization for multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "3",
  pages = "416--423",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:56 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{vanTilborg:1989:PFD,
  author = "A. M. van Tilborg",
  title = "Panel on future directions in parallel computer architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "3--53",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gunther:1989:PBS,
  author = "N. J. Gunther and M. T.
Noga",
  title = "{ParcBench}: a benchmark for shared-memory architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "54--61",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Elkateeb:1989:PSR,
  author = "A. Elkateeb and T. Le-Ngoc",
  title = "A priority strategy on {RISC} for real-time multitasking software applications",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "62--68",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Oyang:1989:MCA,
  author = "Y.-J. Oyang",
  title = "A multiprocessor configuration in accordance with the aspects of physical and systems design",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "69--73",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Seebauer:1989:MCEa,
  author = "H.
Seebauer",
  title = "A memory controller executing segment operations in time {$ O(1) $}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "74--81",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Schwartz:1989:DDD,
  author = "R. J. Schwartz",
  title = "The design and development of a dynamic program behavior measurement tool for the {Intel 8086\slash 88}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "82--94",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Martin:1989:FAM,
  author = "A. J. Martin and S. M. Burns and T. K. Lee and D. Borkovic and P. J. Hazewindus",
  title = "The first asynchronous microprocessor: the test results",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "95--110",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cornett:1989:UMS,
  author = "F.
Cornett",
  title = "The {UT1000} microprogramming simulator: an educational tool",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "111--118",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Yuen:1989:BDD,
  author = "C. K. Yuen and W. F. Wong",
  title = "A bidirectional data driven {Lisp} engine for the direct execution of {Lisp} in parallel",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "4",
  pages = "119--130",
  month = jun,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:08 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Smotherman:1989:SBT,
  author = "M. Smotherman",
  title = "A sequencing-based taxonomy of {I/O} systems and review of historical machines",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "5--15",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cousins:1989:DCR,
  author = "R.
Cousins",
  title = "{DMA} considerations on {RISC} workstations",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "16--23",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Katz:1989:PHP,
  author = "R. H. Katz",
  title = "A project on high performance {I/O} subsystems",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "24--31",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dibble:1989:BSB,
  author = "P. C. Dibble and M. L. Scott",
  title = "Beyond striping: the bridge multiprocessor file system",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "32--39",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Reddy:1989:SPD,
  author = "A. L. N. Reddy and P.
Banerjee",
  title = "A study of parallel disk organizations",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "40--47",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the title of the entry above reads ``A study of parallel disk
%%% organizations''; the preposition had been dropped in the database
%%% record.

@Article{Smith:1989:MRT,
  author = "J. M. Smith and G. Q. {Maguire, Jr.}",
  title = "Measured response times for page-sized fetches on a network",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "48--54",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wolman:1989:ISI,
  author = "B. Wolman and T. M. Olson",
  title = "{IOBENCH}: a system independent {IO} benchmark",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "55--70",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Oslon:1989:DAP,
  author = "T. M.
Olson",
  title = "Disk array performance in a random {IO} environment",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "71--77",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the citation key of the entry above keeps the spelling
%%% ``Oslon'' so that existing citations continue to resolve; the
%%% author's surname is Olson, as in the IOBENCH entry of the same
%%% issue.

@Article{Wolman:1989:ASB,
  author = "B. L. Wolman",
  title = "An analysis of server-based locking",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "78--82",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Debaere:1989:IPC,
  author = "E. H. Debaere",
  title = "Instruction-path coprocessing to solve some {RISC} problems",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "83--94",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Seebauer:1989:MCEb,
  author = "H.
Seebauer",
  title = "A memory controller executing segment operations in time {$ O(1) $}",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "95--102",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chiu:1989:RLF,
  author = "P. K. Chiu",
  title = "Representation of logic functions by {\tt if--then} clauses",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "103--107",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Baleanu:1989:ECC,
  author = "C. Baleanu and D. Tomescu",
  title = "Embedding computers in a cellular array",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "108--115",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lass:1989:HES,
  author = "S.
Lass",
  title = "On hardware enhanced 80386 software emulation, compiled emulation, a program distribution language, and pack computers",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "5",
  pages = "116--118",
  month = sep,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:26 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: author spellings in the next two entries follow the authors'
%%% own usage (Abdelaziz Mzoughi, Pascal Sainrat; H. Hessenauer,
%%% C. W. Oehlrich, J. Volkert).  Initials must be uppercase: BibTeX
%%% treats a lowercase-initial token in the middle of a name as a
%%% ``von'' particle.

@Article{Litaize:1989:MSM,
  author = "Daniel Litaize and Omar Hammami and Mustapha Lalam and Abdelaziz Mzoughi and Pascal Sainrat",
  title = "Multiprocessors with a serial multiport memory and a pseudo crossbar of serial links used as a processor-memory switch",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "8--21",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Fritsch:1989:DSM,
  author = "G. Fritsch and W. Henning and H. Hessenauer and R. Klar and C. U. Linster and C. W. Oehlrich and P. Schlenk and J. Volkert",
  title = "Distributed shared memory multiprocessor architecture {MEMSY} for high performance parallel computations",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "22--35",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mendelson:1989:SCC,
  author = "A.
Mendelson and D. K. Pradhan and A. D. Singh",
  title = "A single cached copy data coherence scheme for multiprocessor systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "36--49",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Feitelson:1989:AMU,
  author = "Dror G. Feitelson and Larry Rudolph",
  title = "Architecture for a multi-user general-purpose parallel system",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "50--56",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Quammen:1989:RWA,
  author = "D. Quammen and D. R. Miller and D.
Tabak",
  title = "Register window architecture for multitasking applications",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "57--66",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Rosenberg:1989:EEI,
  author = "Arnold Rosenberg",
  title = "Efficient emulations of interconnection networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "67--79",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Scherson:1989:DPC,
  author = "Isaac D. Scherson and Peter F.
Corbett",
  title = "Description and performance of a class of orthogonal multiprocessor networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "80--90",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: first author is Ilana David (capital I, not lowercase L).

@Article{David:1989:EIB,
  author = "Ilana David and Ran Ginosar and Michael Yoeli",
  title = "An efficient implementation of {Boolean} functions and finite state machine as self-timed circuit",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "91--104",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the citation key below keeps the spelling ``Dollan'' so that
%%% existing citations continue to resolve; the author's surname is
%%% Dollas.

@Article{Dollan:1989:CSP,
  author = "Apostolos Dollas and Robert F. Krick",
  title = "The case for the sustained performance computer architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "129--136",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Johnson:1989:WSP,
  author = "Eric E.
Johnson",
  title = "Working set prefetching for cache memories",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "137--141",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): the first author's name ``K. e H. Lee'' in the next
%%% entry looks garbled (BibTeX parses the lowercase ``e'' as a von
%%% particle); confirm against the printed issue before changing it.

@Article{Lee:1989:MPC,
  author = "K. e H. Lee and C. H. Lam",
  title = "Message-passing controller for a shared-memory multiprocessor",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "142--149",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1989:LCF,
  author = "Tsong-Chih Hsu and Ling-Yang Kung",
  title = "Logic and conflict-free vector addresses",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "150--153",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1989:AGU,
  author = "Tsong-Chih Hsu and Ling-Yang Kung",
  title = "An address generation unit for array accessing",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "154--160",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1989:HMP,
  author = "Tsong-Chih Hsu and Ling-Yang Kung",
  title = "A hardware mechanism for priority queue",
  journal = j-COMP-ARCH-NEWS,
  volume = "17",
  number = "6",
  pages = "162--169",
  month = dec,
  year = "1989",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:27 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): ``2m'' in the database record is taken to be a
%%% flattened superscript and is typeset below as $ 2^m $; confirm
%%% against the printed issue.  The pages value gives only the first
%%% page of the article.

@Article{Dvorak:1990:MAS,
  author = "V. Dvorak",
  title = "Microsequencer architecture supporting arbitrary branching up to {$ 2^m $} targets",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "9--9",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dongarra:1990:PVC,
  author = "Jack J.
Dongarra",
  title = "Performance of various computers using standard linear equations software",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "17--17",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: given names in the next entry are hyphenated with a single
%%% hyphen, matching the 1989 entries by the same authors; ``---''
%%% would typeset as an em-dash inside the name.

@Article{Hsu:1990:CFO,
  author = "Tsong-Chih Hsu and Ling-Yang Kung",
  title = "A comment on {``A Fetch-and-Op Implementation for Parallel Computers''}",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "32--32",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cousins:1990:NAC,
  author = "Robert Cousins",
  title = "A novel approach to character interfaces",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "35--35",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cousins:1990:RPI,
  author = "Robert Cousins",
  title = "A reentrant peripheral interface",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "43--43",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Anderson:1990:ACS,
  author = "Noel W. Anderson",
  title = "Amorphous computer system architecture: a preliminary look",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "51--51",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Oyang:1990:CEA,
  author = "Yen-Jen Oyang and Bor-Ting Chang and Shu-May Lin",
  title = "A cost-effective approach to implement a long instruction word microprocessor",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "59--59",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Fritsch:1990:PBA,
  author = "C. Fritsch and T. S{\'a}nchez and J.
Anaya",
  title = "Primitive based architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "73--73",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lorin:1990:MRC,
  author = "Harold Lorin",
  title = "A model for recentralization of computing: (distributed processing comes home)",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "81--81",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Teodosiu:1990:CTD,
  author = "Dan Teodosiu",
  title = "Computing in three dimensions",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "99--99",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Frazier:1990:ASM,
  author = "Gary Frazier",
  title = "{Ariel}: a scalable multiprocessor for the simulation of neural networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "107--107",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Colwell:1990:BRH,
  author = "Robert P. Colwell",
  title = "Book review: {{\em High-Level Language Computer Architecture\/}} edited by {Veljko Milutinovic (Computer Science Press, 1989)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "120--122",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Parhami:1990:BRA,
  author = "Behrooz Parhami",
  title = "Book review: {{\em Advanced Research in VLSI}}, edited by {Charles L. Seitz (The MIT Press, Cambridge, MA, 1989, 373 pp.)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "1",
  pages = "122--123",
  month = mar,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:32 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Matthes:1990:HRG,
  author = "Wolfgang Matthes",
  title = "Hardware {Resources}: a generalizing view on computer architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "7--14",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Rauchwerger:1990:MFP,
  author = "Lawrence Rauchwerger and
Michael P. Farmwald",
  title = "A multiple floating point coprocessor architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "15--24",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Glew:1990:SCT,
  author = "Andy Glew and Wen-Mei Hwu",
  title = "Snoopy cache test-and-test-and-set without excessive bus contention",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "25--32",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Higbee:1990:QEC,
  author = "Lee Higbee",
  title = "Quick and easy cache performance analysis",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "33--44",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Park:1990:ISF,
  author = "Arvin Park and Jeffrey C. Becker and Richard J.
Lipton",
  title = "{IOStone}: a synthetic file system benchmark",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "45--52",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Pnevmatikatos:1990:CPI,
  author = "Dionisios N. Pnevmatikatos and Mark D. Hill",
  title = "Cache performance of the integer {SPEC} benchmarks on a {RISC}",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "53--68",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ruighaver:1990:MND,
  author = "A. B.
Ruighaver",
  title = "A modular network for dense optical interconnection of processing elements",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "69--75",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{DeGloria:1990:VVI,
  author = "Alessandro {De Gloria}",
  title = "{VISA}: a variable instruction set architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "76--84",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Williams:1990:ADR,
  author = "Fleur L. Williams and Gordon B. Steven",
  title = "Address and data register separation on the {M68000} family",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "2",
  pages = "85--89",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:46 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Adve:1990:WON,
  author = "Sarita V. Adve and Mark D.
Hill",
  title = "Weak ordering---a new definition",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "2--14",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Gharachorloo:1990:MCE,
  author = "Kourosh Gharachorloo and Daniel Lenoski and James Laudon and Phillip Gibbons and Anoop Gupta and John Hennessy",
  title = "Memory consistency and event ordering in scalable shared-memory multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "15--26",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1990:SMC,
  author = "Joonwon Lee and Umakishore Ramachandran",
  title = "Synchronization with multiprocessor caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "27--37",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chuang:1990:DPA,
  author = "Po-Jen Chuang and Nian-Feng Tzeng",
  title = "Dynamic processor allocation in hypercube computers",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "40--49",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495
(IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: math in the next title is brace-protected, following the
%%% file's {$ O(1) $} convention, so that style-driven case changes
%%% cannot alter it.  NOTE(review): the network size is typeset as
%%% $ r^2 \times r^2 $, on the assumption that the subscripted form in
%%% the database record was a flattened superscript; confirm against
%%% the printed proceedings.

@Article{Youssef:1990:NAF,
  author = "Abdou Youssef and Bruce Arden",
  title = "A new approach to fast control of {$ r^2 \times r^2 $} {$3$}-stage {Benes} networks of {$ r \times r $} crossbar switches",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "50--59",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Dally:1990:VCF,
  author = "William J. Dally",
  title = "Virtual-channel flow control",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "60--68",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Borkar:1990:SSM,
  author = "Shekhar Borkar and Robert Cohn and George Cox and Thomas Gross and H. T.
Kung and Monica Lam and Margie Levine and Brian Moore and Wire Moore and Craig Peterson and Jim Susman and Jim Sutton and John Urbanski and Jon Webb",
  title = "Supporting systolic and memory communication in {iWarp}",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "70--81",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Papadopoulos:1990:MET,
  author = "Gregory M. Papadopoulos and David E. Culler",
  title = "{Monsoon}: an explicit token-store architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "82--91",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Annaratone:1990:KPP,
  author = "Marco Annaratone and Marco Fillo and Kiyoshi Nakabayashi and Marc Viredaz",
  title = "The {K2} parallel processor: architecture and hardware implementation",
  journal = j-COMP-ARCH-NEWS,
  volume = "18",
  number = "3a",
  pages = "92--101",
  month = jun,
  year = "1990",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:03 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Agarwal:1990:APA,
  author = "Anant Agarwal and Beng-Hong Lim and David Kranz and John Kubiatowicz",
  title = "{APRIL}: a processor architecture for
multiprocessing", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "104--114", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bisiani:1990:PDS, author = "Roberto Bisiani and Mosur Ravishankar", title = "{PLUS}: a distributed shared-memory system", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "115--124", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bennett:1990:ASC, author = "John K. Bennett and John B. Carter and Willy Zwaenepoel", title = "Adaptive software cache management for distributed shared memory architectures", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "125--134", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ditzel:1990:BSV, author = "David R. Ditzel and John L. Hennessy and Bernie Rudin and Alan Jay Smith and Stephen L. Squires and Zeke Zalcstein", title = "Big science versus little science---do you have to build it? 
(panel session)", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "136--136", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{OKrafka:1990:EET, author = "Brian W. O'Krafka and A. Richard Newton", title = "An empirical evaluation of two memory-efficient directory methods", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "138--147", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lenoski:1990:DBC, author = "Daniel Lenoski and James Laudon and Kourosh Gharachorloo and Anoop Gupta and John Hennessy", title = "The directory-based cache coherence protocol for the {DASH} multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "148--159", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Przybylski:1990:PIB, author = "Steven Przybylski", title = "The performance impact of block sizes and fetch strategies", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "160--169", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", 
bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alpert:1990:PCL, author = "D. Alpert and A. Averbuch and O. Danieli", title = "Performance comparison of load\slash store and symmetric instruction set architectures", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "172--181", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Davidson:1990:RCB, author = "Jack W. Davidson and David B. Whalley", title = "Reducing the cost of branches by using registers", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "182--191", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Love:1990:ISV, author = "Carl E. Love and Harry F. 
Jordan", title = "An investigation of static versus dynamic scheduling", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "192--201", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhandarkar:1990:VVA, author = "Dileep Bhandarkar and Richard Brunner", title = "{VAX} vector architecture", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "204--215", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Horst:1990:MII, author = "Robert W. Horst and Richard L. Harris and Robert L. Jardine", title = "Multiple instruction issue in the {NonStop Cyclone} processor", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "216--226", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thakkar:1990:POA, author = "Shreekant S. 
Thakkar and Mark Sweiger", title = "Performance of an {OLTP} application on {Symmetry} multiprocessor system", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "228--238", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:1990:ISG, author = "Ding-Kai Chen and Hong-Men Su and Pen-Chung Yew", title = "The impact of synchronization and granularity on parallel systems", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "239--248", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bugge:1990:TDS, author = "H{\aa}kon O. Bugge and Ernst H. Kristiansen and Bj{\o}rn O.
Bakka", title = "Trace-driven simulations for a two-level cache design in open bus systems", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "250--259", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hsu:1990:PMT, author = "Jiun-Ming Hsu and Prithviraj Banerjee", title = "Performance measurement and trace driven simulation of parallel {CAD} and numeric applications on a hypercube multicomputer", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "260--269", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Borg:1990:GAV, author = "Anita Borg and R. E. Kessler and David W. Wall", title = "Generation and analysis of very long address traces", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "270--279", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Holmer:1990:FPE, author = "Bruce K. Holmer and Barton Sano and Michael Carlton and Peter {Van Roy} and Ralph Haygood and William R. Bush and Alvin M. Despain and Joan M. 
Pendleton and Tep Dobry", title = "Fast {Prolog} with an extended general purpose architecture", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "282--291", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alkalaj:1990:ASM, author = "Leon Alkalaj and Tom{\'a}s Lang and Milo{\v{s}} Ercegovac", title = "Architectural support for the management of tightly-coupled fine-grain goals in flat concurrent {Prolog}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "292--301", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ho:1990:BAD, author = "Samuel Ho and Lawrence Snyder", title = "Balance in architectural design", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "302--310", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reddy:1990:SBP, author = "A. L. 
Narasimha Reddy and Prithviraj Banerjee", title = "A study of {I/O} behavior of {Perfect} benchmarks on a multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "312--321", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:1990:MPS, author = "Peter M. Chen and David A. Patterson", title = "Maximizing performance in a striped disk array", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "322--331", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shin:1990:DAH, author = "Kang G. Shin and Greg Dykema", title = "A distributed {I/O} architecture for {HARTS}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "332--342", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1990:BBS, author = "Michael D. Smith and Monica S. Lam and Mark A.
Horowitz", title = "Boosting beyond static scheduling in a superscalar processor", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "344--354", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Taylor:1990:TSL, author = "George Taylor and Peter Davies and Michael Farmwald", title = "The {TLB} slice---a low-cost high-speed address translation mechanism", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "355--363", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jouppi:1990:IDM, author = "Norman P. Jouppi", title = "Improving direct-mapped cache performance by the addition of a small fully-associative cache and prefetch buffers", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "364--373", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Davidson:1990:BTO, author = "Edward S. Davidson and Gurindar S. Sohi and Joseph A. Fisher and Greg Grohoski and Yale Patt and J. E. Smith and David R.
Stiles", title = "Better than one operation per clock (panel): vectors, {VLIW}, and superscalar", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3a", pages = "376--376", month = jun, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alverson:1990:TCS, author = "Robert Alverson and David Callahan and Daniel Cummings and Brian Koblenz and Allan Porterfield and Burton Smith", title = "The {Tera} computer system", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "1--6", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hwang:1990:ORB, author = "K. Hwang and M. Dubois and D. K. Panda and S. Rao and S. Shang and A. Uresin and W. Mao and H. Nair and M. Lytwyn and F. Hsieh and J. Liu and S. Mehrotra and C. M. Cheng", title = "{OMP}: a {RISC}-based multiprocessor using orthogonal-access memories and multiple spanning buses", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "7--22", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dai:1990:BAS, author = "Kechang Dai and Wolfgang K. 
Giloi", title = "A basic architecture supporting {LGDG} computation", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "23--33", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Min:1990:ECS, author = "Sang Lyul Min and Jean-Loup Baer and Hyoung-Joo Kim", title = "An efficient caching support for critical sections in large-scale shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "34--47", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nagashima:1990:IFA, author = "Umpei Nagashima and Fumio Nishimoto and Takashi Shibata and Hiroshi Itoh and Minoru Gotoh", title = "An improvement of {I/O} function for auxiliary storage: parallel {I/O} for a large scale supercomputing", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "48--59", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tzeng:1990:AVH, author = "Nian-Feng Tzeng", title = "Analysis of a variant hypercube topology", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "60--70", month = sep, year = "1990", 
CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{vanderHouwen:1990:POS, author = "P. J. van der Houwen and B. P. Sommeijer", title = "Parallel {ODE} solvers", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "71--81", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dayde:1990:UPL, author = "M. J. Dayd{\'e} and I. S. Duff", title = "Use of parallel level 3 {BLAS} in {LU} factorization on three vector multiprocessors: the {ALLIANT FX/80}, the {CRAY-2}, and the {IBM 3090 VF}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "82--95", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Houstis:1990:ENS, author = "E. N. Houstis and J. R. Rice and N. P. Chrisochoides and H. C. Karathanasis and P. N. Papachiou and M. K. Samartzis and E. A. Vavalis and Ko Yang Wang and S.
Weerawarana", title = "{//ELLPACK}: a numerical simulation programming environment for parallel {MIMD} machines", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "96--107", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Christara:1990:SCP, author = "Christina C. Christara", title = "{Schur} complement preconditioned conjugate gradient methods for spline collocation equations", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "108--120", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chung:1990:COP, author = "Kuo-Liang Chung and Ferng-Ching Lin and Wen-Chin Chen", title = "Cost-optimal parallel {B}-spline interpolations", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "121--131", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gallivan:1990:SGS, author = "K. Gallivan and A. Sameh and Z. 
Zlatev", title = "Solving general sparse linear systems using conjugate gradient-type methods", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "132--139", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yuba:1990:DCD, author = "Toshitsugu Yuba and Toshio Shimada and Yoshinori Yamaguchi and Kei Hiraki and Shuichi Sakai", title = "Dataflow computer development in {Japan}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "140--147", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sarkar:1990:PPO, author = "Vivek Sarkar and David Cann", title = "{POSC}---a partitioning and optimizing {SISAL} compiler", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "148--164", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bodin:1990:LOH, author = "Fran{\c{c}}ois Bodin and Fran{\c{c}}ois Charot", title = "Loop optimization for horizontal microcoded machines", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "164--176", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 
(IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tang:1990:CTD, author = "Peiyi Tang and Pen-Chung Yew and Chuan-Qi Zhu", title = "Compiler techniques for data synchronization in nested parallel loops", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "177--186", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hudak:1990:CTD, author = "David E. Hudak and Santosh G. Abraham", title = "Compiler techniques for data partitioning of sequentially iterated parallel loops", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "187--200", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Klappholz:1990:PAA, author = "David Klappholz and Kleanthis Psarris and Xiangyun Kong", title = "On the perfect accuracy of an approximate subscript analysis test", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "201--212", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer 
Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Malony:1990:HBP, author = "Allen D. Malony and Daniel A. Reed", title = "A hardware-based performance monitor for the {Intel iPSC/2} hypercube", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "213--226", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dimpsey:1990:PDD, author = "R. T. Dimpsey and R. K. Iyer", title = "Performance degradation due to multiprogramming and system overheads in real workloads: case study on a shared memory multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "227--238", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Saad:1990:SBP, author = "Youcef Saad and Harry A. G. 
Wijshoff", title = "{SPARK}: a benchmark package for sparse computations", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "239--253", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cybenko:1990:SPE, author = "George Cybenko and Lyle Kipp and Lynn Pointer and David Kuck", title = "Supercomputer performance evaluation and the {Perfect Benchmarks}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "254--266", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Noor:1990:SLS, author = "Ahmed K. Noor and Jeanne M. Peters", title = "Strategies for large-scale structural problems on high-performance computers", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "267--280", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zecca:1990:ECV, author = "V. Zecca and A. 
Kamel", title = "Elastodynamics on clustered vector multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "281--290", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eijkhout:1990:IPP, author = "Victor Eijkhout", title = "Implementation of $5$-point\slash $9$-point multi-level methods on hypercube architectures", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "291--295", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:1990:SBV, author = "Philip C. 
Chen", title = "Supercomputer-based visualization systems used for analyzing output data of a numerical weather prediction model", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "296--309", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Takahashi:1990:PAW, author = "Yoshizo Takahashi and Shigetaka Sasaki", title = "Parallel automated wire-routing with a number of competing processors", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "310--317", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chan:1990:HAA, author = "Tony F. 
Chan", title = "Hierarchical algorithms and architectures for parallel scientific computing", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "318--329", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1990:IDA, author = "Kevin Smith and Bill Appelbe and Kurt Stirewalt", title = "Incremental dependence analysis for interactive parallelization", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "330--341", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ruhl:1990:PFC, author = "Roland R{\"u}hl and Marco Annaratone", title = "Parallelization of {FORTRAN} code on distributed-memory parallel processors", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "342--353", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gornish:1990:CDD, author = "Edward H. Gornish and Elana D. Granston and Alexander V. 
Veidenbaum", title = "Compiler-directed data prefetching in multiprocessors with memory hierarchies", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "354--368", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gao:1990:TEF, author = "Guang R. Gao and Herbert H. J. Hum and Yue-Bong Wong", title = "Towards efficient fine-grain software pipelining", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "369--379", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Andre:1990:PSM, author = "Fran{\c{c}}oise Andr{\'e} and Jean-Louis Pazat and Henry Thomas", title = "{Pandore}: a system to manage data distribution", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "380--388", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fatoohi:1990:VPA, author = "Rod A. 
Fatoohi", title = "Vector performance analysis of the {NEC SX-2}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "389--400", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bodin:1990:PEP, author = "Fran{\c{c}}ois Bodin and Daniel Windheiser and William Jalby and Daya Atapattu and Mannho Lee and Dennis Gannon", title = "Performance evaluation and prediction for parallel algorithms on the {BBN GP1000}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "401--413", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brochard:1990:DAH, author = "Luigi Brochard and Alex Freau", title = "Designing algorithms on hierarchical memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "414--427", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bucher:1990:ACM, author = "Ingrid Y. Bucher and Donald A. 
Calahan", title = "Access conflicts in multiprocessor memories: queueing models and simulation studies", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "428--438", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Luque:1990:ITD, author = "Emilio Luque and Ana Ripoll and Porfidio Hern{\'a}ndez and Tom{\'a}s Margalef", title = "Impact of task duplication on static-scheduling performance in multiprocessor systems with variable execution-time tasks", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "439--446", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gerasoulis:1990:CTG, author = "Apostolos Gerasoulis and Sesh Venugopal and Tao Yang", title = "Clustering task graphs for message passing architectures", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "447--456", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Paalvast:1990:MPP, author = "Edwin M. Paalvast and Arjan J. van Gemund and Henk J.
Sips", title = "A method for parallel program generation with an application to the {Booster} language", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "457--469", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tsoukarellas:1990:RTS, author = "M. A. Tsoukarellas and T. S. Papatheodorou", title = "A run time support system for multiprocessor machines", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "470--478", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hey:1990:STP, author = "Anthony J. G. 
Hey", title = "Supercomputing with transputers---past, present and future", journal = j-COMP-ARCH-NEWS, volume = "18", number = "3b", pages = "479--489", month = sep, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:03 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1990:EA, author = "Burton Smith", title = "The end of architecture", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "10--17", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hill:1990:WS, author = "Mark D. Hill", title = "What is scalability?", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "18--21", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Laplante:1990:NSI, author = "P. A. 
Laplante", title = "A novel single instruction computer architecture", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "22--26", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ginosar:1990:PAP, author = "Ran Ginosar and Nick Michell", title = "On the potential of asynchronous pipelined processors", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "27--34", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Oyang:1990:EEA, author = "Yen-Jen Oyang and Chun-Hung Wen and Yu-Fen Chen and Shu-May Lin", title = "The effect of employing advanced branching mechanisms in superscalar processors", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "35--52", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Deville:1990:LCU, author = "Yannick Deville", title = "A low-cost usage-based replacement algorithm for cache memories", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "52--58", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = 
"Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gunther:1990:HSM, author = "Bernard K. Gunther", title = "A high speed mechanism for short branches", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "59--61", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McLaughlin:1990:DFD, author = "Robert McLaughlin", title = "Design for fast {DSP} machine", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "62--66", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Joerg:1990:SPN, author = "Werner B. 
Joerg", title = "A subclass of {Petri Nets} as design abstraction for parallel architectures", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "67--77", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1990:UN, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "80--89", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Langdon:1990:BRH, author = "Glen G. {Langdon, Jr.}", title = "Book review: {{\em Highly Parallel Computing\/}} by {George Almasi and Allan Gottlieb (Benjamin\slash Cummings, 1989)}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "90--90", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Langdon:1990:BRS, author = "Glen G. {Langdon, Jr.}", title = "Book review: {{\em Solving Problems on Concurrent Processors, Vol II: Software for Concurrent Processors\/}} by {I. Angus, G. Fox, J. Kim, and D.
Walker (Prentice-Hall, 1990)}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "90--91", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dikotter:1990:BRD, author = "Marc Dikotter", title = "Book review: {{\em The Definition of Standard ML\/}} by {R. Milner, M. Tofte, R. Harper}", journal = j-COMP-ARCH-NEWS, volume = "18", number = "4", pages = "91--91", month = dec, year = "1990", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Leighton:1991:SPS, author = "F. T. Leighton", title = "Selected Papers from the {Symposium on Parallel Algorithms and Architectures}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "5--5", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ngai:1991:FAR, author = "John Y. Ngai and Charles L.
Seitz", title = "A framework for adaptive routing in multicomputer networks", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "6--14", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Beigel:1991:PNI, author = "Richard Beigel and Clyde P. Kruskal", title = "Processor networks and interconnection networks without long wires (extended abstract)", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "15--24", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Annexstein:1991:FTH, author = "Fred Annexstein", title = "Fault tolerance in hypercube-derivative networks (preliminary version)", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "25--34", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fujimoto:1991:VTM, author = "Richard M.
Fujimoto", title = "The {Virtual Time Machine}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "35--44", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bilardi:1991:OVA, author = "Gianfranco Bilardi and Scot W. Hornick and Majid Sarrafzadeh", title = "Optimal {VLSI} architectures for multidimensional {DFT} (preliminary version)", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "45--52", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thomborson:1991:SIM, author = "Clark D. Thomborson and Belle W.-Y. Wei", title = "Systolic implementations of a move-to-front text compressor", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "53--60", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Knight:1991:TLL, author = "Thomas F.
{Knight, Jr.}", title = "Technologies for low latency interconnection switches", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "61--68", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Herbordt:1991:MPA, author = "Martin C. Herbordt and Charles C. Weems and James C. Corbett", title = "Message-passing algorithms for a {SIMD} torus with coteries", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "69--78", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Konstantinidou:1991:CRP, author = "S. Konstantinidou and L. 
Snyder", title = "The chaos router: a practical application of randomization in network routing", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "79--88", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bruck:1991:RAE, author = "Jehoshua Bruck and Robert Cypher and Danny Soroker", title = "Running algorithms efficiently on faulty hypercubes (extended abstract)", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "89--96", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nishimura:1991:ASM, author = "Naomi Nishimura", title = "Asynchronous shared memory parallel computation (preliminary version)", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "97--105", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shand:1991:HSL, author = "M. Shand and P. Bertin and J. 
Vuillemin", title = "Hardware speedups in long integer multiplication", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "106--113", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thapar:1991:CCL, author = "Manu Thapar and Bruce Delagi", title = "Cache coherence for large scale shared memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "114--119", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Grabienski:1991:FFS, author = "Peter Grabienski", title = "{FLIP-FLOP}: a stack-oriented multiprocessing system", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "120--127", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Price:1991:TAD, author = "Camille C. 
Price", title = "Task allocation in data flow multiprocessors: an annotated bibliography", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "128--134", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Adams:1991:PPP, author = "Rod Adams and Gordon Steven", title = "A parallel pipelined processor with conditional instruction execution", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "135--142", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1991:UNa, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "146--150", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hilton:1991:BRS, author = "Michael L. 
Hilton", title = "Book review: {{\em Systems Programming in Parallel Logic Languages\/}} by {Ian Foster (Prentice Hall, 1990)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "151--151", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anthony:1991:BRT, author = "Keith Anthony", title = "Book review: {{\em Technology Projection Modeling of Future Computer Systems\/}} by {Al Cutaia (Prentice-Hall, 1990)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "152--153", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schneck:1991:BRO, author = "Paul B. Schneck", title = "Book review: {{\em Optimizing FORTRAN Programs\/}} by {C. F. 
Schofield (Halsted Press, 1989)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "153--154", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bernecky:1991:BRMa, author = "Robert Bernecky", title = "Book review: {{\em Multiprocessors\/}} by {Daniel Tabak (Prentice Hall, Englewood Cliffs, NJ)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "154--156", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bernecky:1991:BRMb, author = "Robert Bernecky", title = "Book review: {{\em Multiprocessor Performance\/}} by {Erol Gelenbe (J. Wiley \& Sons, Chichester, England)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "156--157", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fulcher:1991:BRN, author = "John Fulcher", title = "Book review: {{\em Neural Net Applications and Products\/}} by {Richard K. Miller, Terri C. Walker, and Anne M.
Ryan (SEAI Technical Publications, 1990)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "1", pages = "157--158", month = mar, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wolfe:1991:VIS, author = "Andrew Wolfe and John P. Shen", title = "A variable instruction stream extension to the {VLIW} architecture", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "2--14", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Katevenis:1991:RBP, author = "Manolis Katevenis and Nestoras Tzartzanis", title = "Reducing the branch penalty by rearranging instructions in a double-width memory", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "15--27", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1991:FPP, author = "Roland L. Lee and Alex Y. Kwok and Fay{\'e} A.
Briggs", title = "The floating point performance of a superscalar {SPARC} processor", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "28--37", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Callahan:1991:SP, author = "David Callahan and Ken Kennedy and Allan Porterfield", title = "Software prefetching", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "40--52", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sohi:1991:HBD, author = "Gurindar S. Sohi and Manoj Franklin", title = "High-bandwidth data memory systems for superscalar processors", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "53--62", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lam:1991:CPO, author = "Monica S. Lam and Edward E. Rothberg and Michael E.
Wolf", title = "The cache performance and optimizations of blocked algorithms", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "63--74", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mogul:1991:ECS, author = "Jeffrey C. Mogul and Anita Borg", title = "The effect of context switches on cache performance", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "75--84", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Keppel:1991:PIF, author = "David Keppel", title = "A portable interface for on-the-fly instruction space modification", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "86--95", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Appel:1991:VMP, author = "Andrew W. 
Appel and Kai Li", title = "Virtual memory primitives for user programs", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "96--107", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anderson:1991:IAO, author = "Thomas E. Anderson and Henry M. Levy and Brian N. Bershad and Edward D. Lazowska", title = "The interaction of architecture and operating system design", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "108--120", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bradlee:1991:IRA, author = "David G. Bradlee and Susan J. Eggers and Robert R. Henry", title = "Integrating register allocation and instruction scheduling for {RISCs}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "122--131", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Benitez:1991:CGS, author = "Manuel E. Benitez and Jack W. 
Davidson",
  title =        "Code generation for streaming: an access\slash
                 execute mechanism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "132--141",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Bagrodia:1991:EIH: ``implementation'' is an ordinary word in this
%%% sentence-case title, so it carries neither a capital nor
%%% case-protecting braces; braces are kept for acronyms and proper
%%% names only.
@Article{Bagrodia:1991:EIH,
  author =       "Rajive Bagrodia and Sharad Mathur",
  title =        "Efficient implementation of high-level parallel
                 programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "142--151",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mangione-Smith:1991:VRD,
  author =       "William Mangione-Smith and Santosh G. Abraham and
                 Edward S. Davidson",
  title =        "Vector register design for polycyclic vector
                 scheduling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "154--163",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Culler:1991:FGP,
  author =       "David E. Culler and Anurag Sah and Klaus E.
Schauser and Thorsten von Eicken and John Wawrzynek", title = "Fine-grain parallelism with minimal hardware support: a compiler-controlled threaded abstract machine", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "164--175", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wall:1991:LIL, author = "David W. Wall", title = "Limits of instruction-level parallelism", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "176--188", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1991:PCP, author = "Edward K. Lee and Randy H. 
Katz", title = "Performance consequences of parity placement in disk arrays", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "190--199", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cate:1991:CCC, author = "Vincent Cate and Thomas Gross", title = "Combining the concepts of compression and caching for a two-level filesystem", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "200--211", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bolosky:1991:NPT, author = "William J. Bolosky and Michael L. Scott and Robert P. Fitzgerald and Robert J. Fowler and Alan L. 
Cox", title = "{NUMA} policies and their relation to memory architecture", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "212--221", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chaiken:1991:LDS, author = "David Chaiken and John Kubiatowicz and Anant Agarwal", title = "{LimitLESS} directories: a scalable cache coherence scheme", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "224--234", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Min:1991:ECB, author = "Sang L. 
Min and Jong-Deok Choi", title = "An efficient cache-based access anomaly detection scheme", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "235--244", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gharachorloo:1991:PEM, author = "Kourosh Gharachorloo and Anoop Gupta and John Hennessy", title = "Performance evaluation of memory consistency models for shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "245--257", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Freudenthal:1991:PCF, author = "Eric Freudenthal and Allan Gottlieb", title = "Process coordination with fetch-and-increment", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "260--268", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mellor-Crummey:1991:SC, author = "John M. Mellor-Crummey and Michael L. 
Scott", title = "Synchronization without contention", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "269--278", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Johnson:1991:CRB, author = "Douglas Johnson", title = "The case for a read barrier", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "279--287", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cmelik:1991:AMS, author = "Robert F. Cmelik and Shing I. Kong and David R. Ditzel and Edmund J. Kelly", title = "An analysis of {MIPS} and {SPARC} instruction set utilization on the {SPEC} benchmarks", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "290--302", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hall:1991:PCA, author = "C. 
Brian Hall and Kevin O'Brien", title = "Performance characteristics of architectural features of the {IBM RISC System\slash 6000}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "303--309", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhandarkar:1991:PAC, author = "Dileep Bhandarkar and Douglas W. Clark", title = "Performance from architecture: comparing a {RISC} and a {CISC} with similar hardware organization", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "310--319", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DeMara:1991:SPA, author = "R. F. DeMara and D. I. Moldovan", title = "The {SNAP-1} parallel {AI} prototype", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "2--11", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tan:1991:GEN, author = "Wei Siong Tan and H. Russ and Cecil O. 
Alford", title = "{GT-EP}: a novel high-performance real-time architecture", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "13--21", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Higuchi:1991:IPA, author = "Tetsuya Higuchi and Tatsumi Furuya and Kenichi Handa and Naoto Takahashi and Hiroyasu Nishiyama and Akio Kokubu", title = "{IXM2}: a parallel associative processor", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "22--31", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kaeli:1991:BHT, author = "David R. Kaeli and Philip G. Emma", title = "Branch history table prediction of moving target branches due to subroutine returns", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "34--42", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Klaiber:1991:ASC, author = "Alexander C. Klaiber and Henry M. 
Levy", title = "An architecture for software-controlled data prefetching", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "43--53", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fu:1991:DPM, author = "John W. C. Fu and Janak H. Patel", title = "Data prefetching in multiprocessor vector cache memories", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "54--63", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Harper:1991:RMC, author = "D. T. {Harper III}", title = "Reducing memory contention in shared memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "66--73", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rau:1991:PRI, author = "B. 
Ramakrishna Rau", title = "Pseudo-randomly interleaved memory", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "74--83", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:1991:EMS, author = "Kai Li and Karin Petersen", title = "Evaluation of memory system extensions", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "84--93", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dowd:1991:HPI, author = "Patrick W. 
Dowd", title = "High performance interprocessor communication through optical wavelength division multiple access channels", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "96--105", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Landin:1991:RFI, author = "Anders Landin and Erik Hagersten and Seif Haridi", title = "Race-free interconnection networks and multiprocessor consistency", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "106--115", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lin:1991:DFM, author = "Xiaola Lin and Lionel M. 
Ni", title = "Deadlock-free multicast wormhole routing in multicomputer networks", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "116--125", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Farrens:1991:DBR, author = "Matthew Farrens and Arvin Park", title = "Dynamic base register caching: a technique for reducing address bus width", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "128--137", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Olukotun:1991:ICH, author = "O. A. Olukotun and T. N. Mudge and R. B. Brown", title = "Implementing a cache for a high-performance {GaAs} microprocessor", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "138--147", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kurian:1991:CPE, author = "Lizyamma Kurian and Paul T. Hulina and Lee D. Coraor and Dhamir N. 
Mannai", title = "Classification and performance evaluation of instruction buffering techniques", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "150--159", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nakajima:1991:OVS, author = "Masaitsu Nakajima and Hiraku Nakano and Yasuhiro Nakakura and Tadahiro Yoshida and Yoshiyuki Goi and Yuji Nakai and Reiji Segawa and Takeshi Kishida and Hiroshi Kadota", title = "{OHMEGA}: a {VLSI} superscalar processor architecture for numerical applications", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "160--168", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vajapeyam:1991:ESC, author = "Sriram Vajapeyam and Gurindar S. Sohi and Wei-Chung Hsu", title = "An empirical study of the {CRAY Y-MP} processor using the {Perfect Club} benchmarks", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "170--179", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stephens:1991:ILP, author = "Chriss Stephens and Bryce Cogswell and John Heinlein and Gregory Palmer and John P. 
Shen",
  title =        "Instruction level profiling and evaluation of the
                 {IBM RS\slash 6000}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "180--189",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Stephens:1991:ILP (the entry ending just above): the machine named
%%% in the title is the IBM RS/6000; the title previously omitted the
%%% ``RS''.

@Article{Dimpsey:1991:PPT,
  author =       "R. T. Dimpsey and R. K. Iyer",
  title =        "Performance prediction and tuning on a
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "190--199",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Oehlrich:1991:PEC,
  author =       "C. W. Oehlrich and A. Quick",
  title =        "Performance evaluation of a communication system for
                 transputer-networks based on monitored event traces",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "202--211",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Konstantinidou:1991:CRA,
  author =       "S. Konstantinidou and L.
Snyder", title = "Chaos router: architecture and performance", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "212--221", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shukla:1991:SPC, author = "Shridhar B. Shukla and Dharma P. Agrawal", title = "Scheduling pipelined communication in distributed memory multiprocessors for real-time applications", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "222--231", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Adve:1991:DDR, author = "Sarita V. Adve and Mark D. Hill and Barton P. Miller and Robert H. B. Netzer", title = "Detecting data races on weak memory systems", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "234--243", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Koldinger:1991:VTD, author = "Eric J. Koldinger and Susan J. Eggers and Henry M. 
Levy", title = "On the validity of trace-driven simulation for multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "244--253", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gupta:1991:CEL, author = "Anoop Gupta and John Hennessy and Kourosh Gharachorloo and Todd Mowry and Wolf-Dietrich Weber", title = "Comparative evaluation of latency reducing and tolerating techniques", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "254--263", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chang:1991:IAF, author = "Pohua P. Chang and Scott A. Mahlke and William Y. Chen and Nancy J. Warter and Wen-mei W. 
Hwu", title = "{IMPACT}: an architectural framework for multiple-instruction-issue processors", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "266--275", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Butler:1991:SIS, author = "Michael Butler and Tse-Yu Yeh and Yale Patt and Mitch Alsup and Hunter Scales and Michael Shebanow", title = "Single instruction stream parallelism is greater than two", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "276--286", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Melvin:1991:EFG, author = "Stephen Melvin and Yale Patt", title = "Exploiting fine-grained parallelism through a combination of hardware and software techniques", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "287--296", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Adve:1991:CHS, author = "Sarita V. Adve and Vikram S. Adve and Mark D. Hill and Mary K. 
Vernon", title = "Comparison of hardware and software cache coherence schemes", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "298--308", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Simoni:1991:MPL, author = "Richard Simoni and Mark Horowitz", title = "Modeling the performance of limited pointers directories for cache coherence", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "309--319", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Quammen:1991:FRM, author = "Donna J. Quammen and D. Richard Miller", title = "Flexible register management for sequential programs", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "320--329", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bradlee:1991:ERP, author = "David G. Bradlee and Susan J. Eggers and Robert R. 
Henry", title = "The effect on {RISC} performance of register set size and structure versus code generation strategy", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "330--339", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Papadopoulos:1991:MRV, author = "Gregory M. Papadopoulos and Kenneth R. Traub", title = "Multithreading: a revisionist view of dataflow architectures", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "342--351", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chiueh:1991:MTV, author = "Tzi-cker Chiueh", title = "Multi-threaded vectorization", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "352--361", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Farrens:1991:SAI, author = "Matthew K. Farrens and Andrew R. 
Pleszkun", title = "Strategies for achieving improved processor throughput", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "362--369", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kagimasa:1991:ASM, author = "Toyohiko Kagimasa and Kikuo Takahashi and Toshiaki Mori and Seiichi Yoshizumi", title = "Adaptive storage management for very large virtual\slash real storage systems", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "372--379", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hall:1991:VVA, author = "Judith S. Hall and Paul T. Robinson", title = "Virtualizing the {VAX} architecture", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "380--389", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Akella:1991:MMI, author = "Janaki Akella and Daniel P. 
Siewiorek", title = "Modeling and measurement of the impact of {Input\slash Output} on system performance", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "390--399", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wilson:1991:PSP, author = "Paul R. Wilson", title = "Pointer swizzling at page fault time: efficiently supporting huge address spaces on standard hardware", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "6--13", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kuga:1991:DDH, author = "Morihiro Kuga and Kazuaki Murakami and Shinji Tomita", title = "{DSNS} (dynamically-hazard-resolved statically-code-scheduled, nonuniform superscalar): yet another superscalar processor architecture", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "14--29", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ponder:1991:PVA, author = "Carl Ponder", title = "Performance variation across benchmark suites", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "30--36", month = jun, year = "1991", 
CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Conte:1991:BSB, author = "Thomas M. Conte and Wen-mei W. Hwu", title = "A brief survey of benchmark usage in the architecture community", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "37--44", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Morris:1991:CER, author = "Todd D. Morris and Edward F. Gehringer", title = "A cost-effective reliable multipath interconnection network", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "45--65", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Laplante:1991:ICB, author = "P. A. 
Laplante", title = "An improved conditional branching scheme for a single instruction computer architecture", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "66--68", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DuBois:1991:DED, author = "Andrew J. DuBois and John Rasure", title = "Design and evaluation of a distributed asynchronous {VLSI} crossbar switch controller for a packet switched supercomputer network", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "69--79", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lass:1991:CCP, author = "Stanley E. 
Lass", title = "The compiler controlled pack cache and messaging", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "80--85", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ungerer:1991:MLP, author = "Theo Ungerer and Eberhard Zehendner", title = "A multi-level parallelism architecture", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "86--93", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Matthes:1991:HMO, author = "Wolfgang Matthes", title = "How many operation units are adequate?", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "94--108", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cunha:1991:AMM, author = "Alberto R. Cunha and Carlos N. Ribeiro and Jos{\'e} A. 
Marques", title = "The architecture of a memory management unit for object-oriented systems", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "109--116", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Matloff:1991:AAS, author = "Norman Matloff", title = "An argument against scalable cache coherency", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "117--123", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rodohan:1991:OAO, author = "D. P. Rodohan and R. J. Glover", title = "An overview of the {A} architecture for optimisation problems in a logic programming environment", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "124--131", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wray:1991:TSD, author = "Stuart C. 
Wray", title = "Time-sequenced {DMA} for multimedia computers", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "132--137", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ramamoorthy:1991:BMC, author = "Ganesh Ramamoorthy and Alok N. Choudhary", title = "A bibliography for multiprocessor cache memories", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "138--153", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:1991:SBC, author = "Alan Jay Smith", title = "Second bibliography on {Cache} memories", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "154--182", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1991:UNb, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "19", number = "4", pages = "185--191", month = jun, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:06 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, 
fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patterson:1991:TGS, author = "David A. Patterson", title = "Towards guidelines for {SIGARCH} sponsored conferences", journal = j-COMP-ARCH-NEWS, volume = "19", number = "5", pages = "7--7", month = sep, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Maa:1991:TED, author = "Yeong-Chang Maa and Dhiraj K. Pradhan and Dominique Thi{\'e}baut", title = "Two economical directory schemes for large-scale cache coherent multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "19", number = "5", pages = "10--10", month = sep, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1991:UNc, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "19", number = "5", pages = "21--26", month = sep, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ivanovic:1991:BRC, author = "Vladimir G. Ivanovic", title = "Book review: {{\em Computation Structures\/}} by {Stephen A Ward and Robert H. Halstead, Jr. 
(MIT Press or McGraw-Hill, 1990)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "5", pages = "27--29", month = sep, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Krieger:1991:BRM, author = "Moshe Krieger", title = "Book review: {{\em Multiprocessors\/}} by {D. Tabak (Prentice-Hall, 1990)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "5", pages = "27--29", month = sep, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fulcher:1991:BRM, author = "John Fulcher", title = "Book review: {{\em The 68000 and 68020 Microprocessors: Hardware, Software and Interfacing Techniques\/}} by {W. Triebel and A. Singh (Prentice Hall, 1991)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "5", pages = "29--30", month = sep, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Baker:1991:PIS, author = "Henry G. 
Baker", title = "Precise instruction scheduling without a precise machine model", journal = j-COMP-ARCH-NEWS, volume = "19", number = "6", pages = "4--8", month = dec, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:27 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McLaughlin:1991:LAB, author = "Robert McLaughlin", title = "Look-ahead branching hardware", journal = j-COMP-ARCH-NEWS, volume = "19", number = "6", pages = "9--11", month = dec, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:27 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Beth:1991:RCI, author = "Thomas Beth and Volker Hatz", title = "A restricted crossbar implementation and its applications", journal = j-COMP-ARCH-NEWS, volume = "19", number = "6", pages = "12--16", month = dec, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:27 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1991:UNd, author = "Mark Thorson", title = "{Usenet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "19", number = "6", pages = "19--23", month = dec, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:27 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal 
= "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bernecky:1991:BRP, author = "Robert Bernecky", title = "Book review: {{\em Past, Present, Parallel: A Survey of Available Parallel Computing Systems\/}} by {Arthur Trew \& Greg Wilson (Eds.), (Springer-Verlag 1991)}", journal = j-COMP-ARCH-NEWS, volume = "19", number = "6", pages = "24--25", month = dec, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:27 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Singh:1992:SSP, author = "Jaswinder Pal Singh and Wolf-Dietrich Weber and Anoop Gupta", title = "{SPLASH}: {Stanford} parallel applications for shared-memory", journal = j-COMP-ARCH-NEWS, volume = "20", number = "1", pages = "5--44", month = mar, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wajda:1992:SSP, author = "Eligiusz Wajda", title = "{SPIRE}: streaming processing with instructions release element", journal = j-COMP-ARCH-NEWS, volume = "20", number = "1", pages = "45--54", month = mar, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Deville:1992:CRP, author = "Yannick Deville and Jean Gobert", title = "A class of replacement 
policies for medium and high-associativity structures", journal = j-COMP-ARCH-NEWS, volume = "20", number = "1", pages = "55--64", month = mar, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zucker:1992:PSM, author = "Richard N. Zucker and Jean-Loup Baer", title = "A performance study of memory consistency models", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "2--12", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Keleher:1992:LRC, author = "Pete Keleher and Alan L. 
Cox and Willy Zwaenepoel", title = "Lazy release consistency for software distributed shared memory", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "13--21", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gharachorloo:1992:HML, author = "Kourosh Gharachorloo and Anoop Gupta and John Hennessy", title = "Hiding memory latency using dynamic scheduling in shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "22--33", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fernandes:1992:EBB, author = "Edil S. T. Fernandes and Fernando M. B. Barbosa", title = "Effects of building blocks on the performance of super-scalar architecture", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "36--45", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lam:1992:LCF, author = "Monica S. Lam and Robert P. 
Wilson", title = "Limits of control flow on parallelism", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "46--57", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Franklin:1992:ESW, author = "Manoj Franklin and Gurindar S. Sohi", title = "The expandable split window paradigm for exploiting fine-grain parallelism", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "58--67", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Litaize:1992:TSM, author = "Daniel Litaize and Abdelaziz Mzoughi and Christine Rochange and Pascal Sainrat", title = "Towards a shared-memory massively parallel multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "70--79", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stenstrom:1992:CPE, author = "Per Stenstr{\"o}m and Truman Joe and Anoop Gupta", title = "Comparative performance evaluation of cache-coherent {NUMA} and {COMA} architectures", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "80--91", month = may, year = "1992", CODEN = "CANED2", ISSN = 
"0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lenoski:1992:DPI, author = "Daniel Lenoski and James Laudon and Truman Joe and David Nakahira and Luis Stevens and Anoop Gupta and John Hennessy", title = "The {DASH} prototype: implementation and performance", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "92--103", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Intrater:1992:PED, author = "Gideon Intrater and Ilan Spillinger", title = "Performance evaluation of a decoded instruction cache for variable instruction-length computers", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "106--113", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:1992:SBS, author = "J. Bradley Chen and Anita Borg and Norman P. 
Jouppi", title = "A simulation based study of {TLB} performance", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "114--123", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yeh:1992:AIT, author = "Tse-Yu Yeh and Yale N. Patt", title = "Alternative implementations of two-level adaptive branch prediction", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "124--134", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hirata:1992:EPA, author = "Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine and Yoshiyuki Mochizuki and Akio Nishimura and Yoshimori Nakase and Teiji Nishizawa", title = "An elementary processor architecture with simultaneous instruction issuing from multiple threads", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "136--145", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sato:1992:TBP, author = "Mitsuhisa Sato and Yuetsu Kodama and Shuichi Sakai and Yoshinori Yamaguchi and Yasuhito Koumura", title = "Thread-based programming for the {EM-4} hybrid dataflow machine", journal = 
j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "146--155", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nikhil:1992:MMP, author = "R. S. Nikhil and G. M. Papadopoulos and Arvind", title = "{*T}: a multithreaded massively parallel architecture", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "156--167", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dubnicki:1992:ABS, author = "Czarek Dubnicki and Thomas J.
LeBlanc", title = "Adjustable block size coherent caches", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "170--180", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Olukotun:1992:POP, author = "Kunle Olukotun and Trevor Mudge and Richard Brown", title = "Performance optimization of pipelined primary cache", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "181--190", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McFarling:1992:CRD, author = "Scott McFarling", title = "Cache replacement with dynamic exclusion", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "191--200", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Keckler:1992:PCI, author = "Stephen W. Keckler and William J.
Dally", title = "Processor coupling: integrating compile time and runtime scheduling for parallelism", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "202--213", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Boothe:1992:IMT, author = "Bob Boothe and Abhiram Ranade", title = "Improved multithreading techniques for hiding communication latency in multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "214--223", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DeGloria:1992:ILP, author = "Alessandro {De Gloria} and Paolo Faraboschi", title = "Instruction-level parallelism in {Prolog}: analysis and architectural support", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "224--233", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kurian:1992:MLE, author = "Lizyamma Kurian and Paul T. Hulina and Lee D. 
Coraor", title = "Memory latency effects in decoupled architectures with a single data memory module", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "236--245", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Seznec:1992:IPS, author = "Andr{\'e} Seznec and Jacques Lenfant", title = "Interleaved parallel schemes: improving memory throughput on supercomputers", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "246--255", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{vonEicken:1992:AMM, author = "Thorsten von Eicken and David E. Culler and Seth Copen Goldstein and Klaus Erik Schauser", title = "Active messages: a mechanism for integrated communication and computation", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "256--266", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chien:1992:PAR, author = "Andrew A. Chien and Jae H. 
Kim", title = "Planar-adaptive routing: low-cost adaptive networks for multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "268--277", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Glass:1992:TMA, author = "Christopher J. Glass and Lionel M. Ni", title = "The turn model for adaptive routing", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "278--287", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shimizu:1992:LLM, author = "Toshiyuki Shimizu and Takeshi Horie and Hiroaki Ishihata", title = "Low-latency message communication support for the {AP1000}", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "288--297", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Aichinger:1992:FBP, author = "Barbara P. 
Aichinger", title = "{Futurebus+} as an {I/O} bus: profile {B}", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "300--307", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reddy:1992:SSO, author = "A. L. Narasimha Reddy", title = "A study of {I/O} system organizations", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "308--317", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Menon:1992:CSA, author = "Jai Menon and Dick Mattson", title = "Comparison of sparing alternatives for disk arrays", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "318--329", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Siegle:1992:MPB, author = "Markus Siegle and Richard Hofmann", title = "Monitoring program behaviour on {SUPRENUM}", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "332--341", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Austin:1992:DDA, author = "Todd M. Austin and Gurindar S. Sohi", title = "Dynamic dependency analysis of ordinary programs", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "342--351", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Najjar:1992:ALL, author = "Walid A. Najjar and W. Marcus Miller and A. P. Wim B{\"o}hm", title = "An analysis of loop latency in dataflow execution", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "352--360", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yang:1992:NCD, author = "Qing Yang and Liping Wu Yang", title = "A novel cache design for vector processing", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "362--371", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Valero:1992:INS, author = "Mateo Valero and Tom{\'a}s Lang and Jos{\'e} M. 
Llaber{\'\i}a and Montse Peiron and Eduard Ayguad{\'e} and Juan J. Navarro", title = "Increasing the number of strides for conflict-free vector access", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "372--381", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wulf:1992:EWA, author = "Wm. A. Wulf", title = "Evaluation of the {WM} architecture", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "382--390", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Johnson:1992:ICL, author = "Kirk L. Johnson", title = "The impact of communication locality on large-scale multiprocessor performance", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "392--402", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Scott:1992:PSR, author = "Steven L. Scott and James R. Goodman and Mary K.
Vernon", title = "Performance of the {SCI} ring", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "403--414", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Talluri:1992:TST, author = "Madhusudhan Talluri and Shing Kong and Mark D. Hill and David A. Patterson", title = "Tradeoffs in supporting two page sizes", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "415--424", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Louri:1992:PEO, author = "Ahmed Louri and Jongwhoa Na", title = "Parallel electro-optical rule-based system for fast execution of expert systems (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "427--427", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Seznec:1992:OAF, author = "Andr{\'e} Seznec and Karl Courtel", title = "{OPAC} (abstract): a floating-point coprocessor dedicated to compute-bound kernels", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "427--427", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", 
ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheng:1992:TCB, author = "Der-Chung Cheng and Kanad Ghose", title = "The time-constrained barrier synchronizer and its applications in parallel systems (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "428--428", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Louri:1992:NCD, author = "Ahmed Louri and Hongki Sung", title = "A new compiler-directed cache coherence scheme for shared memory multiprocessors with fast and parallel explicit invalidation (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "428--428", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Singh:1992:AGP, author = "Gautam B. 
Singh", title = "Architecture of a graphics processor (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "429--429", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yomtov:1992:PED, author = "Ruben Yomtov", title = "Performance evaluation of disk subsystems", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "429--429", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lai:1992:EBS, author = "Feipei Lai and Meng-chou Chang", title = "Enhancing boosting with semantic register in a superscalar processor (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "430--430", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sklenar:1992:PUVa, author = "Ivan Sklen{\'a}{\v{r}}", title = "Prefetch unit for vector operations on scalar computers (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "430--430", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource =
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Newman:1992:MMSa, author = "Gary Newman", title = "Memory management support for tiled array organization (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "431--431", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Uht:1992:DPI, author = "Augustus K. Uht and Darin B. Johnson", title = "Data path issues in a highly concurrent machine (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "431--431", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fineberg:1992:SLT, author = "Samuel A. Fineberg and Thomas L. Casavant and Brent H. Pease", title = "Seamless --- a latency-tolerant {RISC}-based multiprocessor architecture (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "432--432", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sayeed:1992:PMB, author = "M. A. Sayeed and M. 
Atiquzzaman", title = "Performance of multiple-bus multiprocessor under non-uniform memory reference model (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "432--432", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kechadi:1992:PIV, author = "M. Tahar Kechadi and J-L. Dekeyser and Ph. Marquet and Ph. Preux", title = "Performance improvement for vector pipeline multiprocessor systems using a disordered execution model (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "433--433", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Varma:1992:CPS, author = "Anujan Varma and Gunjan Sinha", title = "A class of prefetch schemes for on-chip data caches", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "433--433", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Abnous:1992:PBV, author = "Arthur Abnous and Nader Bagherzadeh", title = "Pipelining and bypassing in a {VLIW} processor (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "434--434", month = may, year =
"1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Prakash:1992:SAS, author = "Shiv Prakash and Alice C. Parker", title = "Synthesis of application-specific heterogeneous multiprocessor systems (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "434--434", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Farrens:1992:PTL, author = "Matthew Farrens and Arvin Park and Rob Fanfelle and Pius Ng and Gary Tyson", title = "A partitioned translation lookaside buffer approach to reducing address bandwidth (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "435--435", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Laudon:1992:AIT, author = "James Laudon and Anoop Gupta and Mark Horowitz", title = "Architectural and implementation tradeoffs in the design of multiple-context processors (abstract)", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "435--435", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alleyne:1992:EDN, author = "Brian D. Alleyne and Isaac D. Scherson", title = "Expanded delta networks for very large parallel computers", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "436--436", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Singh:1992:IHB, author = "Jaswinder Pal Singh", title = "Implications of hierarchical {N-body} methods for multiprocessor architecture", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "436--436", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Michael:1992:DBC, author = "Wisam Michael", title = "Directory-based cache coherency protocol for a ring-connected multiprocessor-array", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "437--437", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:1992:RCD, author = "Wen-Hann Wang and Jim Quinlan and Konrad Lai", 
title = "Revisit the case for direct-mapped caches: a case for two-way set-associative level-two caches", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "437--437", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Culler:1992:AMM, author = "David E. Culler and Michial Gunter and James C. Lee", title = "Analysis of multithreaded microprocessors under multiprogramming", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "438--438", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wittenbrink:1992:CWG, author = "C. M. Wittenbrink and A. K. Somani and C. H. Chen", title = "Cache write generate for high performance parallel processing", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "438--438", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burkhardt:1992:ICA, author = "Walter H.
Burkhardt and Stefan Rust", title = "Integrated computer architecture development system", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "439--439", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chevance:1992:EMM, author = "R. J. Chevance", title = "An evaluation methodology for microprocessor and system architecture", journal = j-COMP-ARCH-NEWS, volume = "20", number = "3", pages = "4--13", month = jun, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Laird:1992:CTC, author = "Michael Laird", title = "A comparison of three current superscalar designs", journal = j-COMP-ARCH-NEWS, volume = "20", number = "3", pages = "14--21", month = jun, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dongarra:1992:PVC, author = "Jack J. 
Dongarra", title = "Performance of various computers using standard linear equations software", journal = j-COMP-ARCH-NEWS, volume = "20", number = "3", pages = "22--44", month = jun, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Keown:1992:PHR, author = "William F. {Keown, Jr.} and Philip {Koopman, Jr.} and Aaron Collins", title = "Performance of the {HARRIS RTX 2000} stack architecture versus the {Sun 4 SPARC} and the {Sun 3 M68020} Architectures", journal = j-COMP-ARCH-NEWS, volume = "20", number = "3", pages = "45--52", month = jun, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1992:UNa, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "20", number = "3", pages = "56--62", month = jun, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chalterjee:1992:BRI, author = "Siddhartha Chatterjee", title = "Book review: {{\em The Impact of Vector and Parallel Architectures on the Gaussian Elimination Algorithm\/}} by {Yves Robert (Manchester University Press and Halsted Press, 1991)}", journal = j-COMP-ARCH-NEWS, volume = "20", number = "3",
pages = "63--64", month = jun, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Esponda:1992:GCR, author = "Margarita Esponda and Ra{\'u}l Rojas", title = "A graphical comparison of {RISC} processors", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "2--8", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Matsui:1992:DRM, author = "Shogo Matsui", title = "Dynamic refresh method for dynamic {RAMs}", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "9--16", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Park:1992:CRS, author = "Arvin Park and Ron Maeder", title = "Codes to reduce switching transients across {VLSI I/O} pins", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "17--21", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = 
"https://dl.acm.org/loi/sigarch", } @Article{Newman:1992:MMSb, author = "Gary Newman", title = "Memory management support for tiled array organization", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "22--30", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sklenar:1992:PUVb, author = "Ivan Sklen{\'a}{\v{r}}", title = "Prefetch unit for vector operations on scalar computers", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "31--37", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Malik:1992:ILP, author = "Nadeem Malik and Richard J. Eickemeyer and Stamatis Vassiliadis", title = "Instruction-level parallelism from execution interlock collapsing", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "38--43", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vassiliadis:1992:ASO, author = "Stamatis Vassiliadis and Bart Blaner and Richard J. 
Eickemeyer", title = "On the attributes of the {SCISM} organization", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "44--53", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1992:UNb, author = "Mark Thorson", title = "{Usenet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "56--64", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Allen:1992:BRC, author = "Ken Allen", title = "Book review: {{\em Computing with Parallel Architectures: T.Node\/}}, edited by {D. Gassilloud and J. C. 
Grossetie (Kluwer Academic Publishers 1991)}", journal = j-COMP-ARCH-NEWS, volume = "20", number = "4", pages = "65--66", month = sep, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Michael:1992:FMB, author = "Gavin Michael and Andrew Chien", title = "Future multicomputers: beyond minimalist multiprocessors?", journal = j-COMP-ARCH-NEWS, volume = "20", number = "5", pages = "6--12", month = dec, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kaushal:1992:CHH, author = "R. P. Kaushal and J. S. 
Bedi", title = "Comparison of hypercube, hypernet, and symmetric hypernet architectures", journal = j-COMP-ARCH-NEWS, volume = "20", number = "5", pages = "13--25", month = dec, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1992:UNc, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "20", number = "5", pages = "28--33", month = dec, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Levy:1992:BRN, author = "David Levy", title = "Book review: {{\em Neural Networks and Fuzzy Systems: A Dynamical Systems Approach to Machine Intelligence\/}} by {Bart Kosko (Prentice Hall 1992)}", journal = j-COMP-ARCH-NEWS, volume = "20", number = "5", pages = "34--34", month = dec, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Inoue:1993:PEV, author = "Atsushi Inoue and Kenji Takeda", title = "Performance evaluation for various configuration of superscalar processors", journal = j-COMP-ARCH-NEWS, volume = "21", number = "1", pages = "4--11", month = mar, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate 
= "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Uht:1993:EMIa, author = "Augustus K. Uht", title = "Extraction of massive instruction level parallelism", journal = j-COMP-ARCH-NEWS, volume = "21", number = "1", pages = "12--14", month = mar, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ullah:1993:MIP, author = "Nasr Ullah and Matt Holle", title = "The {MC88110} implementation of precise exceptions in a superscalar architecture", journal = j-COMP-ARCH-NEWS, volume = "21", number = "1", pages = "15--25", month = mar, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Deville:1993:PDP, author = "Yannick Deville", title = "A process-dependent partitioning strategy for cache memories", journal = j-COMP-ARCH-NEWS, volume = "21", number = "1", pages = "26--33", month = mar, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1993:UNa, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = 
j-COMP-ARCH-NEWS, volume = "21", number = "1", pages = "36--38", month = mar, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Staff:1993:BR, author = "{ACM SIGARCH Computer Architecture News Staff}", title = "Book reviews", journal = j-COMP-ARCH-NEWS, volume = "21", number = "1", pages = "39--39", month = mar, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:33 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cypher:1993:ARP, author = "R. Cypher and A. Ho and S. Konstantinidou and P. 
Messina", title = "Architectural requirements of parallel scientific applications with explicit communication", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "2--13", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rothberg:1993:WSC, author = "Edward Rothberg and Jaswinder Pal Singh and Anoop Gupta", title = "Working sets, cache sizes, and node granularity issues for large-scale multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "14--26", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nagle:1993:DTS, author = "David Nagle and Richard Uhlig and Tim Stanley and Stuart Sechrest and Trevor Mudge and Richard Brown", title = "Design tradeoffs for software-managed {TLBs}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "27--38", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huck:1993:AST, author = "Jerry Huck and Jim Hays", title = "Architectural support for translation table management in large address space machines", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = 
"39--50", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cao:1993:TPR, author = "Pei Cao and Swee Boon Lim and Shivakumar Venkataraman and John Wilkes", title = "The {TickerTAIP} parallel {RAID} architecture", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "52--63", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stodolsky:1993:PLO, author = "Daniel Stodolsky and Garth Gibson and Mark Holland", title = "Parity logging overcoming the small write problem in redundant disk arrays", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "64--75", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Menon:1993:AFT, author = "Jai Menon and Jim Cortney", title = "The architecture of a fault-tolerant cached {RAID} controller", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "76--87", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = 
ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dubois:1993:DEU, author = "Michel Dubois and Jonas Skeppstedt and Livio Ricciulli and Krishnan Ramamurthy and Per Stenstr{\"o}m", title = "The detection and elimination of useless misses in multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "88--97", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cox:1993:ACC, author = "Alan L. Cox and Robert J. Fowler", title = "Adaptive cache coherency for detecting migratory shared data", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "98--108", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stenstrom:1993:ACC, author = "Per Stenstr{\"o}m and Mats Brorsson and Lars Sandberg", title = "An adaptive cache coherence protocol optimized for migratory sharing", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "109--118", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Waldspurger:1993:RRF, author = "Carl A. Waldspurger and William E. 
Weihl", title = "Register relocation: flexible contexts for multithreading", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "120--130", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hidaka:1993:MTC, author = "Yasuo Hidaka and Hanpei Koike and Hidehiko Tanaka", title = "Multiple threads in cyclic register windows", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "131--142", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dwarkadas:1993:ERC, author = "Sandhya Dwarkadas and Peter Keleher and Alan L. Cox and Willy Zwaenepoel", title = "Evaluation of release consistent software distributed shared memory on emerging network technology", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "144--155", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wood:1993:MCS, author = "David A. Wood and Satish Chandra and Babak Falsafi and Mark D. Hill and James R. Larus and Alvin R. Lebeck and James C. Lewis and Shubhendu S. Mukherjee and Subbarao Palacharla and Steven K. 
Reinhardt", title = "Mechanisms for cooperative shared memory", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "156--167", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Seznec:1993:CTW, author = "Andr{\'e} Seznec", title = "A case for two-way skewed-associative caches", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "169--178", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agarwal:1993:CAC, author = "Anant Agarwal and Stephen D. Pudar", title = "Column-associative caches: a technique for reducing the miss rate of direct-mapped caches", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "179--190", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jouppi:1993:CWP, author = "Norman P. 
Jouppi", title = "Cache write policies and performance", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "191--201", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", }

@Article{Boyd:1993:HPM,
  author =       "Eric L. Boyd and Edward S. Davidson",
  title =        "Hierarchical performance modeling with {MACS}: a case
                 study of the {Convex C-240}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "203--212",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kuck:1993:CSI,
  author =       "D. Kuck and E. Davidson and D. Lawrie and A. Sameh and
                 C. Q. Zhu and A. Veidenbaum and J. Konicek and P. Yew
                 and K. Gallivan and W. Jalby and H. Wijshoff and R.
                 Bramley and U. M. Yang and P. Emrath and D. Padua and
                 R. Eigenmann and J. Hoeflinger and G. Jaxon and Z. Li
                 and T. Murphy and J. Andrews",
  title =        "The {Cedar} system and an initial performance study",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "213--223",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Noakes:1993:JMM, author = "Michael D. Noakes and Deborah A. Wallach and William J.
Dally", title = "The {J-machine} multicomputer: an architectural evaluation", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "224--235", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", }

@Article{Bunda:1993:BVB,
  author =       "John Bunda and Don Fussell and W. C. Athas and Roy
                 Jenevein",
  title =        "16-bit vs.\ 32-bit instructions for pipelined
                 microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "237--246",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kiyohara:1993:RCN,
  author =       "Tokuzo Kiyohara and Scott Mahlke and William Chen and
                 Roger Bringmann and Richard Hank and Sadun Anik and
                 Wen-Mei Hwu",
  title =        "Register connection: a new approach to adding
                 registers into instruction set architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "247--256",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yeh:1993:CDB, author = "Tse-Yu Yeh and Yale N.
Patt", title = "A comparison of dynamic branch predictors that use two levels of branch history", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "257--266", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", }

@Article{Barroso:1993:PCC,
  author =       "Luiz Andr{\'e} Barroso and Michel Dubois",
  title =        "The performance of cache-coherent ring-based
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "268--277",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tullsen:1993:LCP,
  author =       "Dean M. Tullsen and Susan J. Eggers",
  title =        "Limitations of cache prefetching on a bus-based
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "278--288",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Herlihy:1993:TMA, author = "Maurice Herlihy and J. Eliot B.
Moss", title = "Transactional memory: architectural support for lock-free data structures", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "289--300", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Spertus:1993:EMF, author = "Ellen Spertus and Seth Copen Goldstein and Klaus Erik Schauser and Thorsten von Eicken and David E. Culler and William J. Dally", title = "Evaluation of mechanisms for fine-grained parallel programs in the {J-machine} and the {CM-5}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "302--313", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Horie:1993:IAP, author = "Takeshi Horie and Kenichi Hayashi and Toshiyuki Shimizu and Hiroaki Ishihata", title = "Improving {AP1000} parallel computer performance with message communication", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "314--325", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hsu:1993:PCD, author = "W.-C. Hsu and J. E. 
Smith", title = "Performance of cached {DRAM} organizations in vector supercomputers", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "327--336", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gao:1993:CRT, author = "Q. S. Gao", title = "The {Chinese} remainder theorem and the prime memory system", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "337--340", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Seznec:1993:OMS, author = "Andr{\'e} Seznec and Jacques Lenfant", title = "Odd memory systems may be quite interesting", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "341--350", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Boppana:1993:CAW, author = "Rajendra V. 
Boppana and Suresh Chalasani", title = "A comparison of adaptive wormhole routing algorithms", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "351--360", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Uht:1993:EMIb, author = "Augustus K. Uht", title = "Extraction of massive instruction level parallelism", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "5--12", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ramanathan:1993:SCP, author = "Gowri Ramanathan and Joel Oren", title = "Survey of commercial parallel machines", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "13--33", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ewy:1993:SCP, author = "Benjamin J. Ewy and Joseph B. 
Evans", title = "Secondary cache performance in {RISC} architecture", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "34--37", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Danesh:1993:PLC, author = "Iraj Danesh", title = "Physical limitations of a computer", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "40--45", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1993:UNb, author = "Mark Thorson", title = "{Usenet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "46--49", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fostel:1993:BRP, author = "Gary Fostel", title = "Book Reviews: {{\em Principles of Computer Systems\/}} by {Gerald M. Karam \& John C. 
Bryant (Prentice Hall 1992)}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "50--51", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fostel:1993:BRC, author = "Gary Fostel", title = "Book Review: {{\em Computer Architecture\/}} by {Mario De Blasi (Addison-Wesley Publishing Company, 1990)}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "51--53", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fulcher:1993:BRP, author = "John Fulcher", title = "Book Review: {{\em Practical Parallel Computing\/}} by {Paul Messina and Almerico Murli, Editors (John Wiley and Sons, 1992)}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "3", pages = "53--54", month = jun, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hill:1993:WAR, author = "Mark D. Hill and James R. Larus and Alvin R. Lebeck and Madhusudhan Talluri and David A. 
Wood", title = "{Wisconsin Architectural Research Tool Set}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "4", pages = "8--10", month = sep, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hyatt:1993:HPO, author = "Craig Hyatt", title = "A high-performance object-oriented memory", journal = j-COMP-ARCH-NEWS, volume = "21", number = "4", pages = "11--19", month = sep, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dewan:1993:CUM, author = "Gautam Dewan and V. S. S. 
Nair", title = "A case for uniform memory access multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "21", number = "4", pages = "20--26", month = sep, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1993:UNc, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "21", number = "4", pages = "27--28", month = sep, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Langdon:1993:BR, author = "Glen Langdon", title = "Book Reviews", journal = j-COMP-ARCH-NEWS, volume = "21", number = "4", pages = "29--29", month = sep, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jain:1993:ISI, author = "Ravi Jain and John Werth and J. C. 
Browne", title = "Introduction to the {Special Issue on Input\slash Output in Parallel Computer Systems}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "5--6", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Corbett:1993:OVP, author = "Peter F. Corbett and Sandra Johnson Baylor and Dror G. Feitelson", title = "Overview of the {Vesta} parallel file system", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "7--14", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lin:1993:PIA, author = "Z. Lin and S. Zhou", title = "Parallelizing {I/O} intensive applications for a workstation cluster: a case study", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "15--22", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fineberg:1993:INA, author = "Samuel A. 
Fineberg", title = "Implementing the {NHT-1} application {I/O} benchmark", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "23--30", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{delRosario:1993:IPT, author = "Juan Miguel del Rosario and Rajesh Bordawekar and Alok Choudhary", title = "Improved parallel {I/O} via a two-phase run-time access strategy", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "31--38", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ghandeharizadeh:1993:OTS, author = "Shahram Ghandeharizadeh and Cyrus Shahabi and Luis Ramos", title = "An overview of techniques to support continuous retrieval of multimedia objects", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "39--46", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jain:1993:SPO, author = "Ravi Jain and Kiran Somalwar and John Werth and J. C. 
Browne", title = "Scheduling parallel {I/O} operations", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "47--54", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:1993:TTF, author = "Qiang Li and Naphtali Rishe", title = "A transputer {T9000} family based architecture for parallel database machines", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "55--62", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Assmann:1993:RPA, author = "Claus A{\ss}mann", title = "A {RISC} processor architecture with a versatile stack system", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "63--70", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:1993:NDH, author = "Dajin Wang", title = "A note on {``Diagnosabilities of hypercubes under the pessimistic one-step diagnosis strategy''}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "71--78", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1993:UNd, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "79--85", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alverson:1993:BRH, author = "Bob Alverson", title = "Book Review: {{\em High-Speed Digital Design: A Handbook of Black Magic\/}} by {Howard W. Johnson and Martin Graham (Prentice-Hall, 1993)}", journal = j-COMP-ARCH-NEWS, volume = "21", number = "5", pages = "85--86", month = dec, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:19 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Iannucci:1994:AII, author = "Robert Iannucci and Anant Agarwal and Bill Dally and Anoop Gupta and Greg Papadopoulos and Burton Smith", title = "Architectural and implementation issues for multithreading (panel session {I})", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "3--18", month = mar, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } 
@Article{Halstead:1994:PCR, author = "Burt Halstead and David Callahan and Jack Dennis and R. S. Nikhil and Vivek Sarkar", title = "Programming, compilation, and resource management issues for multithreading (panel session {II})", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "19--33", month = mar, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Baker:1994:LLP, author = "Henry G. Baker", title = "Linear logic and permutation stacks---the {Forth} shall be first", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "34--43", month = mar, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mendlson:1994:CTI, author = "Abraham Mendlson and Shlomit S. 
Pinter and Ruth Shtokhamer",
  title =        "Compile time instruction cache optimizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "44--51",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Barach:1994:HVF,
  author =       "David Barach and Jaspal Kohli and John Slice and
                 Marc Spaulding and Rajeev Bharadhwaj and Don Hudson
                 and Cliff Neighbors and Nirmal Saxena and Rolland
                 Crunk",
  title =        "{HALSIM}---a very fast {SPARC V9} behavioral model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "52--58",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1994:UNa,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "59--60",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Madruga:1994:BRS,
  author =       "Ewerton Longoni Madruga",
  title =        "Book Review: {{\em SNMP, SNMPv2, and CMIP: The
                 Practical Guide to Network Management Standards\/}}
                 by {William Stallings (Addison-Wesley Publishing
                 Company Inc.,
1993)}", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "60--61", month = mar, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Calder:1994:FAI, author = "B. Calder and D. Grunwald", title = "Fast and accurate instruction fetch and branch prediction", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "2--11", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Talcott:1994:IUB, author = "A. R. Talcott and W. Yamamoto and M. J. Serrano and R. C. Wood and M. Nemirovsky", title = "The impact of unresolved branches on branch prediction scheme performance", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "12--21", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Palacharla:1994:ESB, author = "S. Palacharla and R. E. 
Kessler", title = "Evaluating stream buffers as a secondary cache replacement", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "24--33", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jouppi:1994:TTL, author = "N. P. Jouppi and S. J. E. Wilton", title = "Tradeoffs in two-level on-chip caching", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "34--45", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Singhal:1994:ASP, author = "A. Singhal and A. J. Goldberg", title = "Architectural support for performance tuning: a case study on the {SPARCcenter 2000}", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "48--59", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cvetanovic:1994:CAA, author = "Z. Cvetanovic and D. 
Bhandarkar", title = "Characterization of {Alpha AXP} performance using {TP} and {SPEC} workloads", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "60--70", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Natarajan:1994:MBC, author = "C. Natarajan and S. Sharma and R. K. Iyer", title = "Measurement-based characterization of global memory and network contention, operating system and parallelization overheads", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "71--80", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Joe:1994:EMO, author = "T. Joe and J. L. Hennessy", title = "Evaluating the memory overhead required for {COMA} architectures", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "82--93", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Klaiber:1994:CMP, author = "A. C. Klaiber and H. M. 
Levy",
  title =        "A comparison of message passing and shared memory
                 architectures for data parallel programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "94--105",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cox:1994:SVH,
  author =       "A. L. Cox and S. Dwarkadas and P. Keleher and H. Lu
                 and R. Rajamony and W. Zwaenepoel",
  title =        "Software versus hardware shared-memory
                 implementation: a case study",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "106--117",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pnevmatikatos:1994:GEB,
  author =       "D. N. Pnevmatikatos and G. S. Sohi",
  title =        "Guarded execution and branch prediction in dynamic
                 {ILP} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "120--129",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Su:1994:BMS,
  author =       "C.-L. Su and A. M.
Despain", title = "Branch with masked squashing in superpipelined processors", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "130--140", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Blumrich:1994:VMM, author = "M. A. Blumrich and K. Li and R. Alpert and C. Dubnicki and E. W. Felten and J. Sandberg", title = "Virtual memory mapped network interface for the {SHRIMP} multicomputer", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "142--153", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Steenkiste:1994:AEH, author = "P. Steenkiste and M. Hemy and T. Mummert and B. Zill", title = "Architecture and evaluation of a high-speed networking subsystem for distributed-memory systems", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "154--163", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nayfeh:1994:EDS, author = "B. A. Nayfeh and K. 
Olukotun", title = "Exploring the design space for a shared-cache multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "166--175", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thekkath:1994:ISB, author = "R. Thekkath and S. J. Eggers", title = "Impact of sharing-based thread placement on multithreaded architectures", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "176--186", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dahlgren:1994:CPG, author = "F. Dahlgren and M. Dubois and P. Stenstr{\"o}m", title = "Combined performance gains of simple cache protocol extensions", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "187--197", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huang:1994:SDC, author = "A. S. Huang and G. Slavenburg and J. P. 
Shen", title = "Speculative disambiguation: a compilation technique for dynamic memory disambiguation", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "200--210", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Farkas:1994:CPT, author = "K. I. Farkas and N. P. Jouppi", title = "Complexity\slash performance tradeoffs with non-blocking loads", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "211--222", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:1994:PSS, author = "T.-F. Chen and J.-L. Baer", title = "A performance study of software and hardware data prefetching schemes", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "223--232", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Drapeau:1994:RIH, author = "A. L. Drapeau and K. W. Shirriff and J. H. Hartman and E. L. Miller and S. Seshan and R. H. Katz and K. Lutz and D. A. Patterson and E. K. Lee and P. M. Chen and G. A. 
Gibson", title = "{RAID-II}: a high-bandwidth network file server", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "234--244", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Blaum:1994:EOS, author = "M. Blaum and J. Brady and J. Bruck and J. Menon", title = "{EVENODD}: an optimal scheme for tolerating double disk failures in {RAID} architectures", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "245--254", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ng:1994:CDA, author = "S. W. Ng", title = "Crosshatch disk array for improved reliability and performance", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "255--264", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DeHon:1994:MRA, author = "A. DeHon and F. Chong and M. Becker and E. Egozy and H. Minsky and S. Peretz and T. F. 
{Knight, Jr.}", title = "{METRO}: a router architecture for high-performance, short-haul routing networks", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "266--277", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Allen:1994:AAR, author = "J. D. Allen and P. T. Gaughan and D. E. Schimmel and S. Yalamanchili", title = "{Ariadne}---an adaptive router for fault-tolerant multicomputers", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "278--288", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:1994:CRF, author = "J. H. Kim and Z. Liu and A. A. Chien", title = "Compressionless routing: a framework for adaptive and fault-tolerant routing", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "289--300", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kuskin:1994:SFM, author = "J. Kuskin and D. Ofelt and M. Heinrich and J. Heinlein and R. Simoni and K. Gharachorloo and J. Chapin and D. Nakahira and J. Baxter and M. Horowitz and A. Gupta and M. Rosenblum and J. 
Hennessy", title = "The {Stanford FLASH} multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "302--313", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chaiken:1994:SEC, author = "D. Chaiken and A. Agarwal", title = "Software-extended coherent shared memory: performance and cost", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "314--324", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reinhardt:1994:TTU, author = "S. K. Reinhardt and J. R. Larus and D. A. Wood", title = "{Tempest} and {Typhoon}: user-level shared memory", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "325--336", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Farrens:1994:SSC, author = "M. Farrens and G. Tyson and A. R. 
Pleszkun", title = "A study of single-chip processor\slash cache organizations for large numbers of transistors", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "338--347", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:1994:UAT, author = "C.-H. Chen and A. K. Somani", title = "A unified architectural tradeoff methodology", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "348--357", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nagle:1994:OAC, author = "D. Nagle and R. Uhlig and T. Mudge and S. Sechrest", title = "Optimal allocation of on-chip memory for multiple-{API} operating systems", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "358--369", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Quong:1994:ECM, author = "R. W. 
Quong",
  title =        "Expected {I-cache} miss rates via the gap model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "372--383",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:1994:DSC,
  author =       "A. Seznec",
  title =        "Decoupled sectored caches: conciliating low tag
                 implementation cost and low miss ratio",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "384--393",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gurd:1994:SBB,
  author =       "J. R. Gurd",
  title =        "Supercomputing: big bang or steady state growth?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "3",
  pages =        "3--13",
  month =        jun,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Litchfield:1994:IES,
  author =       "Kay P.
Litchfield", title = "Instruction execution sequence confirmation", journal = j-COMP-ARCH-NEWS, volume = "22", number = "3", pages = "14--18", month = jun, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Allen:1994:RWR, author = "Phil Allen and Franc Brglez and Hal Carter and Robert Caverly and Jerry Dillion and Albert Lo and Ron Lomax and John Oldfield and Cesar Pina and T. J. Wilkinson", title = "Report of the {1993 Workshop on Rapid Prototyping of Microelectronic Systems for Universities}", journal = j-COMP-ARCH-NEWS, volume = "22", number = "3", pages = "19--26", month = jun, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1994:UNb, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "22", number = "3", pages = "27--28", month = jun, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Madruga:1994:BRI, author = "Ewerton Longoni Madruga", title = "Book Review: {{\em Internetworking with TCP/IP, vol. III: Client-Server programming and applications (BSD Sockets version)\/}} by {Douglas E. Comer and David L. 
Stevens (Prentice-Hall, 1993)}", journal = j-COMP-ARCH-NEWS, volume = "22", number = "3", pages = "29--30", month = jun, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jain:1994:SII, author = "Ravi Jain and John Werth and J. C. Browne", title = "{Special Issue on Input\slash Output in Parallel Computer Systems}: {Introduction}", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "3--4", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Baylor:1994:PEM, author = "Sandra Johnson Baylor and Caroline Benveniste and Yarsun Hsu", title = "Performance evaluation of a massively parallel {I/O} subsystem", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "5--10", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sinclair:1994:IPS, author = "James B. Sinclair and Jay Tang and Peter J. 
Varman", title = "Instability in parallel {I/O} systems", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "11--16", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vanderleest:1994:MBC, author = "Steven H. Vanderleest and Ravishankar K. Iyer", title = "Measurement of {I/O} bus contention and correlation among heterogeneous device types in a single-bus multiprocessor system", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "17--22", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thakur:1994:CCD, author = "Rajeev Thakur and Rajesh Bordawekar and Alok Choudhary", title = "Compilation of out-of-core data parallel programs for distributed memory machines", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "23--28", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Asthana:1994:EAM, author = "Abhaya Asthana and Mark Cravatts and Paul Krzyzanowski", title = "An experimental active memory based {I/O} subsystem", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "29--34", month = sep, year = 
"1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Durand:1994:DSA, author = "Dannie Durand and Ravi Jain and David Tseytlin", title = "Distributed scheduling algorithms to improve the performance of parallel data transfers", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "35--40", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yokota:1994:DND, author = "Haruo Yokota", title = "{DR-nets}: data-reconstruction networks for highly reliable parallel-disk systems", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "41--46", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Forsell:1994:MMPa, author = "Martti J. 
Forsell", title = "Are multiport memories physically feasible?", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "47--54", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chaudhry:1994:CMP, author = "Ghulam Chaudhry and Xuechang Li", title = "A case for the multithreaded processor architecture", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "55--59", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chan:1994:ECF, author = "Yin Chan and Ashok Sudarsanam and Andrew Wolfe", title = "The effect of compiler-flag tuning on {SPEC} benchmark performance", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "60--70", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1994:RCC, author = "Jin-Ho Lee and Min-Young Lee and Seong-Uk Choi and Myong-Soon Park", title = "Reducing cache conflicts in data cache prefetching", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "71--77", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = 
"Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1994:UNc, author = "Mark Thorson", title = "{Usenet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "78--81", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Forsell:1994:MMPb, author = "Martti J. Forsell", title = "Are multiport memories physically feasible?", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "3--10", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sosic:1994:HCH, author = "Rok Sosi{\v{c}}", title = "History cache: hardware support for reverse execution", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "11--18", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hill:1994:WWT, author = "Mark D. Hill and James R. Larus and David A. 
Wood", title = "The {Wisconsin Wind Tunnel} project: an annotated bibliography", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "19--26", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Saha:1994:DDT, author = "Avijit Saha and Nadeem Malik", title = "Distributed directory tags", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "27--29", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Unwala:1994:SMP, author = "Ishaq H. Unwala and Harvey G. 
Cragon", title = "A study of {MIPS} programs", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "30--40", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1994:IN, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "41--46", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ohnemus:1994:BIL, author = "Kenneth R. Ohnemus and Diana F. Mallin", title = "Benefits of implementing on-line methods and procedures", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "49--55", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cunningham:1994:LDT, author = "Daniel K. Cunningham and Steven J. 
Reilly", title = "Leading the design team---the evolution of the technical writer from a support role to a design role", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "56--60", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rockley:1994:MTE, author = "Ann Rockley", title = "Multimedia: towards an electronic performance support system", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "61--65", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Drew:1994:TTM, author = "Katherine E. 
Drew", title = "Telecommunicators and telecommuters: making multiple-site documentation projects work", journal = j-COMP-ARCH-NEWS, volume = "22", number = "5", pages = "66--75", month = dec, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:20 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Severson:1995:TCP, author = "Aimee Severson and Brent Nelson", title = "Throughput in a counterflow pipeline processor", journal = j-COMP-ARCH-NEWS, volume = "23", number = "1", pages = "5--12", month = mar, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hsu:1995:SAC, author = "Tsong-Chih Hsu and Sheng-De Wang", title = "A simple architecture for constant time sorting machines", journal = j-COMP-ARCH-NEWS, volume = "23", number = "1", pages = "13--19", month = mar, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wulf:1995:HMW, author = "Wm. A. Wulf and Sally A. 
McKee", title = "Hitting the memory wall: implications of the obvious", journal = j-COMP-ARCH-NEWS, volume = "23", number = "1", pages = "20--24", month = mar, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1995:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "23", number = "1", pages = "25--28", month = mar, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agarwal:1995:AMA, author = "Anant Agarwal and Ricardo Bianchini and David Chaiken and Kirk L. 
Johnson and David Kranz and John Kubiatowicz and Beng-Hong Lim and Kenneth Mackenzie and Donald Yeung", title = "The {MIT Alewife} machine: architecture and performance", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "2--13", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kodama:1995:EXP, author = "Yuetsu Kodama and Hirohumi Sakane and Mitsuhisa Sato and Hayato Yamana and Shuichi Sakai and Yoshinori Yamaguchi", title = "The {EM-X} parallel computer: architecture and basic performance", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "14--23", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Woo:1995:SPC, author = "Steven Cameron Woo and Moriyoshi Ohara and Evan Torrie and Jaswinder Pal Singh and Anoop Gupta", title = "The {SPLASH-2} programs: characterization and methodological considerations", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "24--36", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Grahn:1995:ESS, author = "H{\aa}kan Grahn and Per Stenstr{\"o}m", title = "Efficient strategies for 
software-only protocols in shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "38--47", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lebeck:1995:DSI, author = "Alvin R. Lebeck and David A. Wood", title = "Dynamic self-invalidation: reducing coherence overhead in shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "48--59", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dahlgren:1995:BPH, author = "Fredrik Dahlgren", title = "Boosting the performance of hybrid snooping cache protocols", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "60--69", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nowatzyk:1995:CNW, author = "Andreas G. Nowatzyk and Michael C. Browne and Edmund J. 
Kelly and Michael Parkin",
  title =        "{S-Connect}: from networks of workstations to
                 supercomputer performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "71--82",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Nowatzyk:1995:CNW (above): the system name is braced as one whole
%%% word so that styles which recase titles leave its capitals intact.
%%% NOTE(review): confirm the capitalization against the proceedings.

@Article{Varma:1995:DAD,
  author =       "Anujan Varma and Quinn Jacobson",
  title =        "Destage algorithms for disk arrays with non-volatile
                 caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "83--95",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stoll:1995:EMP,
  author =       "Gordon Stoll and Bin Wei and Douglas Clark and Edward
                 W. Felten and Kai Li and Patrick Hanrahan",
  title =        "Evaluating multi-port frame buffer designs for a
                 mesh-connected multicomputer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "96--105",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nowatzyk:1995:CRD,
  author =       "Andreas G. Nowatzyk and Paul R.
Prucnal", title = "Are crossbars really dead?: the case for optical multiprocessor interconnect systems", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "106--115", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jourdan:1995:ECF, author = "St{\'e}phan Jourdan and Pascal Sainrat and Daniel Litaize", title = "Exploring configurations of functional units in an out-of-order superscalar processor", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "117--125", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ando:1995:USE, author = "Hideki Ando and Chikako Nakanishi and Tetsuya Hara and Masao Nakaya", title = "Unconstrained speculative execution with predicated state buffering", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "126--137", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mahlke:1995:CFP, author = "Scott A. Mahlke and Richard E. Hank and James E. McCormick and David I. August and Wen-Mei W. 
Hwu", title = "A comparison of full and partial predicated execution support for {ILP} processors", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "138--150", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Simone:1995:ITO, author = "M. Simone and A. Essen and A. Ike and A. Krishnamoorthy and T. Maruyama and N. Patkar and M. Ramaswami and M. Shebanow and V. Thirumalaiswamy and D. Tovey", title = "Implementation trade-offs in using a restricted data flow architecture in a high performance {RISC} microprocessor", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "151--162", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Diep:1995:PEP, author = "Trung A. Diep and Christopher Nelson and John Paul Shen", title = "Performance evaluation of the {PowerPC 620} microarchitecture", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "163--174", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Romer:1995:RTM, author = "Theodore H. Romer and Wayne H. Ohlrich and Anna R. Karlin and Brian N. 
Bershad", title = "Reducing {TLB} and memory overhead using online superpage promotion", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "176--187", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:1995:SIA, author = "Zheng Zhang and Josep Torrellas", title = "Speeding up irregular applications in shared-memory multiprocessors: memory binding and group prefetching", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "188--199", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anjan:1995:EFA, author = "K. V. Anjan and Timothy Mark Pinkston", title = "An efficient, fully adaptive deadlock recovery scheme: {DISHA}", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "201--210", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shin:1995:AIH, author = "Kang G. Shin and Stuart W. 
Daniel",
  title =        "Analysis and implementation of hybrid switching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "211--219",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Dao:1995:CFC: accent supplied on the second author's given name.
%%% NOTE(review): confirm against the author's preferred spelling.
@Article{Dao:1995:CFC,
  author =       "Binh Vien Dao and Jos{\'e} Duato and Sudhakar
                 Yalamanchili",
  title =        "Configurable flow control mechanisms for
                 fault-tolerant routing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "220--229",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Callahan:1995:NLO,
  author =       "Timothy Callahan and Seth Copen Goldstein",
  title =        "{NIFDY}: a low overhead, high throughput network
                 interface",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "230--241",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Peiron:1995:VMA,
  author =       "Montse Peiron and Mateo Valero and Eduard Ayguad{\'e}
                 and Tom{\'a}s Lang",
  title =        "Vector multiprocessors with arbitrated memory access",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "243--252",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495
(IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kavi:1995:DCM, author = "Krishna M. Kavi and A. R. Hurson and Phenil Patadia and Elizabeth Abraham and Ponnarasu Shanmugam", title = "Design of cache memories for multi-threaded dataflow architecture", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "253--264", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bodin:1995:SAE, author = "Fran{\c{c}}ois Bodin and Andr{\'e} Seznec", title = "Skewed associativity enhances performance predictability", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "265--274", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Young:1995:CAS, author = "Cliff Young and Nicolas Gloy and Michael D. 
Smith", title = "A comparative analysis of schemes for correlated branch prediction", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "276--286", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Calder:1995:NCL, author = "Brad Calder and Dirk Grunwald", title = "Next cache line and set prediction", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "287--296", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Karamcheti:1995:CAS, author = "Vijay Karamcheti and Andrew A. Chien", title = "A comparison of architectural support for messaging in the {TMC CM-5} and the {Cray T3D}", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "298--307", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Stricker:1995:OMS, author = "T. Stricker and T. 
Gross",
  title =        "Optimizing memory system performance for communication in parallel computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "308--319",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arpaci:1995:EEC,
  author =       "Remzi H. Arpaci and David E. Culler and Arvind Krishnamurthy and Steve G. Steinberg and Katherine Yelick",
  title =        "Empirical evaluation of the {CRAY-T$3$D}: a compiler perspective",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "320--331",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Conte:1995:OIF,
  author =       "Thomas M. Conte and Kishore N. Menezes and Patrick M. Mills and Burzin A.
Patel",
  title =        "Optimization of instruction fetch mechanisms for high issue rates",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "333--344",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Uhlig:1995:IFC,
  author =       "Richard Uhlig and David Nagle and Trevor Mudge and Stuart Sechrest and Joel Emer",
  title =        "Instruction fetching: coping with code bloat",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "345--356",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1995:ICF,
  author =       "Dennis Lee and Jean-Loup Baer and Brad Calder and Dirk Grunwald",
  title =        "Instruction cache fetch policies for speculative execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "357--367",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Austin:1995:SDC,
  author =       "Todd M. Austin and Dionisios N. Pnevmatikatos and Gurindar S.
Sohi",
  title =        "Streamlining data cache access with fast address calculation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "369--380",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:1995:CCA,
  author =       "Hong Wang and Tong Sun and Qing Yang",
  title =        "{CAT}---caching address tags: a technique for reducing area cost of on-chip caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "381--390",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tullsen:1995:SMM,
  author =       "Dean M. Tullsen and Susan J. Eggers and Henry M. Levy",
  title =        "Simultaneous multithreading: maximizing on-chip parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "392--403",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ho:1995:AVP,
  author =       "Richard C. Ho and C. Han Yang and Mark A. Horowitz and David L.
Dill",
  title =        "Architecture validation for processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "404--413",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sohi:1995:MP,
  author =       "Gurindar S. Sohi and Scott E. Breach and T. N. Vijaykumar",
  title =        "Multiscalar processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "414--425",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Beckmann:1995:HPM,
  author =       "Carl J.
Beckmann",
  title =        "{HTGL}: a program modelling language",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "3",
  pages =        "3--10",
  month =        jun,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lafitte:1995:SDH,
  author =       "Jean-Louis Lafitte",
  title =        "On structured data handling in parallel processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "3",
  pages =        "11--18",
  month =        jun,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ulmann:1995:ESB,
  author =       "B.
Ulmann",
  title =        "{o$ \mu $-EP-1}: a simple 32-bit architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "3",
  pages =        "19--24",
  month =        jun,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1995:INb,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "3",
  pages =        "25--27",
  month =        jun,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tabak:1995:CMH,
  author =       "Daniel Tabak",
  title =        "{{\em Cache and Memory Hierarchy Design: A Performance-Directed Approach\/}} by {Steven A. Przybylski}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "3",
  pages =        "28--28",
  month =        jun,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wilkes:1995:MWC,
  author =       "Maurice V.
Wilkes",
  title =        "The memory wall and the {CMOS} end-point",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "4",
  pages =        "4--6",
  month =        sep,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:13 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Johnson:1995:GMW,
  author =       "Eric E. Johnson",
  title =        "Graffiti on ``the memory wall''",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "4",
  pages =        "7--8",
  month =        sep,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:13 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Afzal:1995:PMU,
  author =       "Tariq Afzal",
  title =        "Performance modeling using the {Motorola PowerPC} timing simulator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "4",
  pages =        "9--18",
  month =        sep,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:13 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parhami:1995:SMD,
  author =       "Behrooz Parhami",
  title =        "{SIMD} machines: do they have a significant future?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "4",
  pages =        "19--22",
  month =        sep,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:13 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jain:1995:AAE,
  author =       "Ravi Jain and John Werth",
  title =        "Airdisks and {airRAID} (expanded extract): modeling and scheduling periodic wireless data broadcast",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "4",
  pages =        "23--28",
  month =        sep,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:13 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kontothanassis:1995:ESM,
  author =       "Leonidas I. Kontothanassis and Michael L. Scott",
  title =        "Efficient shared memory with minimal hardware support",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "4",
  pages =        "29--35",
  month =        sep,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:13 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gschwind:1995:VP,
  author =       "Michael K. Gschwind and Thomas J. Pietsch",
  title =        "Vector prefetching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "5",
  pages =        "1--7",
  month =        dec,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:20 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Karne:1995:OOC,
  author =       "Ramesh K.
Karne",
  title =        "Object-oriented computer architectures for new generation of applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "5",
  pages =        "8--19",
  month =        dec,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:20 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Khalid:1995:URA,
  author =       "Humayun Khalid",
  title =        "The unconventional replacement algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "5",
  pages =        "20--26",
  month =        dec,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:20 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Khalid:1995:TDS,
  author =       "Humayun Khalid",
  title =        "A trace-driven simulation methodology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "5",
  pages =        "27--33",
  month =        dec,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:20 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mirghafori:1995:TSB,
  author =       "Nikki Mirghafori and Margret Jacoby and David Patterson",
  title =        "Truth in {SPEC} benchmarks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "5",
  pages =        "34--42",
  month =        dec,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:20 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1995:INc,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "5",
  pages =        "43--44",
  month =        dec,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:20 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mudge:1996:RPH,
  author =       "Trevor Mudge",
  title =        "Report on the panel: {``How Can Computer Architecture Researchers Avoid Becoming the Society for Irreproducible Results?''}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "1",
  pages =        "1--5",
  month =        mar,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kwon:1996:COR,
  author =       "Oh-Young Kwon and Gi-Ho Park and Tack-Don Han",
  title =        "A compiler optimization to reduce execution time of loop nest",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "1",
  pages =        "6--11",
  month =        mar,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1996:INa,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "1",
  pages =        "12--16",
  month =        mar,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tabak:1996:BRA,
  author =       "Daniel Tabak",
  title =        "Book Review: {{\em Alpha Implementations and Architecture\/}} by {Dileep P. Bhandarkar}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "1",
  pages =        "17--18",
  month =        mar,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Evers:1996:UHB,
  author =       "Marius Evers and Po-Yung Chang and Yale N. Patt",
  title =        "Using hybrid branch predictors to improve branch prediction accuracy in the presence of context switches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "3--11",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gloy:1996:ADB,
  author =       "Nicolas Gloy and Cliff Young and J. Bradley Chen and Michael D.
Smith",
  title =        "An analysis of dynamic branch prediction schemes on system workloads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "12--21",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sechrest:1996:CAD,
  author =       "Stuart Sechrest and Chih-Chieh Lee and Trevor Mudge",
  title =        "Correlation and aliasing in dynamic branch predictors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "22--32",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Reinhardt:1996:DHS,
  author =       "Steven K. Reinhardt and Robert W. Pfile and David A.
Wood",
  title =        "Decoupled hardware support for distributed shared memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "34--43",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yeung:1996:MMS,
  author =       "Donald Yeung and John Kubiatowicz and Anant Agarwal",
  title =        "{MGS}: a multigrain shared memory system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "44--55",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Morin:1996:COB,
  author =       "Christine Morin and Alain Gefflaut and Michel Ban{\^a}tre and Anne-Marie Kermarrec",
  title =        "{COMA}: an opportunity for building fault-tolerant scalable shared memory multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "56--65",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nayfeh:1996:EDA,
  author =       "Basem A.
Nayfeh and Lance Hammond and Kunle Olukotun",
  title =        "Evaluation of design alternatives for a multiprocessor microprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "67--77",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Burger:1996:MBL,
  author =       "Doug Burger and James R. Goodman and Alain K{\"a}gi",
  title =        "Memory bandwidth limitations of future microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "78--89",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Saulsbury:1996:MMW,
  author =       "Ashley Saulsbury and Fong Pong and Andreas Nowatzyk",
  title =        "Missing the memory wall: the case for processor\slash memory integration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "90--101",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:1996:DUP,
  author =       "Andr{\'e} Seznec",
  title =        "Don't use the page number, but a pointer to it",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "104--113",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM),
0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Juan:1996:DBC,
  author =       "Toni Juan and Tom{\'a}s Lang and Juan J. Navarro",
  title =        "The difference-bit cache",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "114--120",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Iftode:1996:UAP,
  author =       "Liviu Iftode and Jaswinder Pal Singh and Kai Li",
  title =        "Understanding application performance on shared virtual memory systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "122--133",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Holt:1996:AAB,
  author =       "Chris Holt and Jaswinder Pal Singh and John Hennessy",
  title =        "Application and architectural bottlenecks in large scale distributed shared memory machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "134--145",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wilson:1996:ICP,
  author =       "Kenneth M. Wilson and Kunle Olukotun and Mendel Rosenblum",
  title =        "Increasing cache port efficiency for dynamic superscalar microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "147--157",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Austin:1996:HBA,
  author =       "Todd M. Austin and Gurindar S. Sohi",
  title =        "High-bandwidth address translation for multiple-issue processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "158--167",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hu:1996:DDC,
  author =       "Yiming Hu and Qing Yang",
  title =        "{DCD}---disk caching disk: a new approach for boosting {I/O} performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "169--178",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Maquelin:1996:PWC,
  author =       "Olivier Maquelin and Guang R. Gao and Herbert H. J. Hum and Kevin B.
Theobald and Xin-Min Tian",
  title =        "Polling watchdog: combining polling and interrupts for efficient message handling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "179--188",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tullsen:1996:ECI,
  author =       "Dean M. Tullsen and Susan J. Eggers and Joel S. Emer and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm",
  title =        "Exploiting choice: instruction fetch and issue on an implementable simultaneous multithreading processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "191--202",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Eickemeyer:1996:EMU,
  author =       "Richard J. Eickemeyer and Ross E. Johnson and Steven R. Kunkel and Mark S.
Squillante and Shiafun Liu",
  title =        "Evaluation of multithreaded uniprocessors for commercial application environments",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "203--212",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hara:1996:PCI,
  author =       "Tetsuya Hara and Hideki Ando and Chikako Nakanishi and Masao Nakaya",
  title =        "Performance comparison of {ILP} machines with cycle time evaluation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "213--224",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:1996:RCQ,
  author =       "Jae H. Kim and Andrew A. Chien",
  title =        "Rotating combined queueing {(RCQ)}: bandwidth and latency guarantees in low-cost, high-performance networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "226--236",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rexford:1996:RAR,
  author =       "Jennifer Rexford and John Hall and Kang G.
Shin",
  title =        "A router architecture for real-time point-to-point networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "237--246",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mukherjee:1996:CNI,
  author =       "Shubhendu S. Mukherjee and Babak Falsafi and Mark D. Hill and David A. Wood",
  title =        "Coherent network interfaces for fine-grain communication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "247--258",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Horowitz:1996:IMO,
  author =       "Mark Horowitz and Margaret Martonosi and Todd C. Mowry and Michael D.
Smith",
  title =        "Informing memory operations: providing memory performance feedback in modern processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "260--270",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Xia:1996:IPS,
  author =       "Chun Xia and Josep Torrellas",
  title =        "Instruction prefetching of systems codes with layout optimized for reduced cache misses",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "271--282",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Choi:1996:CHS,
  author =       "Lynn Choi and Pen-Chung Yew",
  title =        "Compiler and hardware support for cache coherence in large-scale multiprocessors: design considerations and performance study",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "2",
  pages =        "283--294",
  month =        may,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Felten:1996:EEM,
  author =       "Edward W. Felten and Richard D. Alpert and Angelos Bilas and Matthias A. Blumrich and Douglas W. Clark and Stefanos N.
Damianakis and Cezary Dubnicki and Liviu Iftode and Kai Li",
  title = "Early experience with message-passing on the {SHRIMP} multicomputer",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "2",
  pages = "296--307",
  month = may,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:47 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Lovett:1996:SCN,
  author = "Tom Lovett and Russell Clapp",
  title = "{STiNG}: a {CC-NUMA} computer system for the commercial marketplace",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "2",
  pages = "308--317",
  month = may,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:47 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 24, number 3 (June 1996) starts here.

@Article{Carretero:1996:MPD,
  author = "J. Carretero and F. P{\'e}rez and P. de Miguel and F. Garc{\'\i}a and L. Alonso",
  title = "A massively parallel and distributed {I/O} subsystem",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "3",
  pages = "1--8",
  month = jun,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:58 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ligon:1996:DLB,
  author = "W. B. {Ligon III} and Daniel C.
{Stanzione, Jr.}",
  title = "Distributing and load-balancing for loops in scientific applications",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "3",
  pages = "9--17",
  month = jun,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:58 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Belayneh:1996:DNBa,
  author = "Samson Belayneh and David R. Kaeli",
  title = "A discussion on non-blocking\slash lockup-free caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "3",
  pages = "18--25",
  month = jun,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:58 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1996:INb,
  author = "Mark Thorson",
  title = "{Internet} Nuggets",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "3",
  pages = "26--32",
  month = jun,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:58 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 24, number 4 (September 1996) starts here.

@Article{Paez-Monzon:1996:RPD,
  author = "Gerard P{\'a}ez-Monz{\'o}n and Charles P{\'a}ez-Monz{\'o}n",
  title = "The {RISC} processor {DMN-6}: a unified data-control flow architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "3--10",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13
MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Pulido:1996:ETT,
  author = "J. A. G{\'o}mez Pulido and J. M. S{\'a}nchez P{\'e}rez and J. A. Moreno Zamora",
  title = "An educational tool for testing hierarchical multilevel caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "11--15",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Belayneh:1996:DNBb,
  author = "Samson Belayneh and David R. Kaeli",
  title = "A discussion on non-blocking\slash lockup-free caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "16--16",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Rosenbaum:1996:AP,
  author = "Mark Rosenbaum",
  title = "Architectural potholes",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "17--18",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mashey:1996:AP,
  author = "John Mashey",
  title = "Architectural potholes",
  journal =
j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "18--18",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Cockcroft:1996:P,
  author = "Adrian Cockcroft",
  title = "{I/O} potholes",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "18--19",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ebrahim:1996:P,
  author = "Zahir Ebrahim",
  title = "{I/O} potholes",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "19--20",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Carlile:1996:IB,
  author = "Brad Carlile",
  title = "Interpreting benchmarks",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "20--21",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chase:1996:RW,
  author = "David Chase",
  title = "Register windows",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "21--21",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{DeMone:1996:RWD,
  author = "Paul W. DeMone",
  title = "Register windows and delay slots",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "4",
  pages = "21--22",
  month = sep,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:13 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 24, number 5 (December 1996) starts here.

@Article{Rose:1996:CIT,
  author = "Charlton D. Rose and J.
Kelly Flanagan",
  title = "Constructing instruction traces from cache-filtered address traces {(CITCAT)}",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "5",
  pages = "1--8",
  month = dec,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:20 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hummel:1996:EDS,
  author = "Susan Flynn Hummel",
  title = "Efficient data sharing with conditional remote memory transfers",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "5",
  pages = "9--17",
  month = dec,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:20 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Widigen:1996:EOR,
  author = "Larry Widigen and Elliot Sowadsky and Kevin McGrath",
  title = "Eliminating operand read latency",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "5",
  pages = "18--22",
  month = dec,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:20 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Machanick:1996:CSM,
  author = "Philip Machanick",
  title = "The case for {SRAM} main memory",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "5",
  pages = "23--30",
  month = dec,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:20 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 25, number 1 (March 1997) starts here.

@Article{Bhandarkar:1997:RVC,
  author = "Dileep Bhandarkar",
  title = "{RISC} versus {CISC}: a tale of two chips",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "1",
  pages = "1--12",
  month = mar,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:35 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Martin:1997:SCM,
  author = "I. Mart{\'\i}n and F. Tirado",
  title = "A {SIMD} computer for multigrid methods",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "1",
  pages = "13--18",
  month = mar,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:35 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Weicker:1997:USB,
  author = "Reinhold Weicker",
  title = "On the use of {SPEC} benchmarks in computer architecture research",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "1",
  pages = "19--22",
  month = mar,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:35 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Mukherjee:1997:WSG,
  author = "Shubhendu S.
Mukherjee",
  title = "What should graduate students know before joining a large computer architecture project?",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "1",
  pages = "23--26",
  month = mar,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:35 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Khalid:1997:NCR,
  author = "Humayun Khalid",
  title = "A new cache replacement scheme based on backpropagation neural networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "1",
  pages = "27--33",
  month = mar,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:35 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1997:INa,
  author = "Mark Thorson",
  title = "{Internet} Nuggets",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "1",
  pages = "34--36",
  month = mar,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:35 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 25, number 2 (May 1997) starts here.

@Article{Vajapeyam:1997:ISI,
  author = "Sriram Vajapeyam and Tulika Mitra",
  title = "Improving superscalar instruction dispatch and issue by exploiting dynamic code sequences",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "1--12",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48
MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Nair:1997:EIL,
  author = "Ravi Nair and Martin E. Hopkins",
  title = "Exploiting instruction level parallelism in processors by caching scheduled groups",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "13--25",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ebcioglu:1997:DDC,
  author = "Kemal Ebcio{\u{g}}lu and Erik R. Altman",
  title = "{DAISY}: dynamic compilation for 100\% architectural compatibility",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "26--37",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Pinkston:1997:DIN,
  author = "Timothy Mark Pinkston and Sugath Warnakulasuriya",
  title = "On deadlocks in interconnection networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "38--49",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Stunkel:1997:IMW,
  author = "Craig B.
Stunkel and Rajeev Sivaram and Dhabaleswar K. Panda",
  title = "Implementing multidestination worms in switch-based parallel systems: architectural alternatives and their impact",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "50--61",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Alvarez:1997:TMF,
  author = "Guillermo A. Alvarez and Walter A. Burkhard and Flaviu Cristian",
  title = "Tolerating multiple failures in {RAID} architectures with optimal storage and uniform declustering",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "62--72",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Teodosiu:1997:HFC,
  author = "Dan Teodosiu and Joel Baxter and Kinshuk Govil and John Chapin and Mendel Rosenblum and Mark Horowitz",
  title = "Hardware fault containment in scalable shared-memory multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "73--84",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Martin:1997:ECL,
  author = "Richard P. Martin and Amin M. Vahdat and David E. Culler and Thomas E.
Anderson",
  title = "Effects of communication latency, overhead, and bandwidth in a cluster architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "85--97",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Weber:1997:MIA,
  author = "Wolf-Dietrich Weber and Stephen Gold and Pat Helland and Takeshi Shimizu and Thomas Wicki and Winfried Wilcke",
  title = "The {Mercury Interconnect Architecture}: a cost-effective infrastructure for high-performance servers",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "98--107",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Hakura:1997:DAC,
  author = "Ziyad S. Hakura and Anoop Gupta",
  title = "The design and analysis of a cache architecture for texture mapping",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "108--120",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Wilson:1997:DHB,
  author = "Kenneth M.
Wilson and Kunle Olukotun",
  title = "Designing high bandwidth on-chip caches",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "121--132",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Farkas:1997:MSD,
  author = "Keith I. Farkas and Paul Chow and Norman P. Jouppi and Zvonko Vranesic",
  title = "Memory-system design considerations for dynamically-scheduled processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "133--143",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Ranganathan:1997:ISP,
  author = "Parthasarathy Ranganathan and Vijay S. Pai and Hazim Abdel-Shafi and Sarita V.
Adve",
  title = "The interaction of software prefetching with {ILP} processors in shared-memory systems",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "144--156",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kontothanassis:1997:VBS,
  author = "Leonidas Kontothanassis and Galen Hunt and Robert Stets and Nikolaos Hardavellas and Micha{\l} Cierniak and Srinivasan Parthasarathy and Wagner {Meira, Jr.} and Sandhya Dwarkadas and Michael Scott",
  title = "{VM}-based shared memory on low-latency, remote-memory-access networks",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "157--169",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Kagi:1997:ESL,
  author = "Alain K{\"a}gi and Doug Burger and James R. Goodman",
  title = "Efficient synchronization: let them eat {QOLB}",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "170--180",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Moshovos:1997:DSS,
  author = "Andreas Moshovos and Scott E. Breach and T. N. Vijaykumar and Gurindar S.
Sohi",
  title = "Dynamic speculation and synchronization of data dependences",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "181--193",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sodani:1997:DIR,
  author = "Avinash Sodani and Gurindar S. Sohi",
  title = "Dynamic instruction reuse",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "194--205",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Palacharla:1997:CES,
  author = "Subbarao Palacharla and Norman P. Jouppi and J. E. Smith",
  title = "Complexity-effective superscalar processors",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "206--218",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Michael:1997:CCA,
  author = "Maged M. Michael and Ashwini K. Nanda and Beng-Hong Lim and Michael L.
Scott",
  title = "Coherence controller architectures for {SMP}-based {CC-NUMA} multiprocessors",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "219--228",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Falsafi:1997:RND,
  author = "Babak Falsafi and David A. Wood",
  title = "Reactive {NUMA}: a design for unifying {S-COMA} and {CC-NUMA}",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "229--240",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Laudon:1997:SOC,
  author = "James Laudon and Daniel Lenoski",
  title = "The {SGI Origin}: a {ccNUMA} highly scalable server",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "241--251",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Joseph:1997:PUM,
  author = "Doug Joseph and Dirk Grunwald",
  title = "Prefetching using {Markov} predictors",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "252--263",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT
2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Santhanam:1997:DPH,
  author = "Vatsa Santhanam and Edward H. Gornish and Wei-Chung Hsu",
  title = "Data prefetching on the {HP PA-8000}",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "264--273",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:1997:TPI,
  author = "Po-Yung Chang and Eric Hao and Yale N. Patt",
  title = "Target prediction for indirect jumps",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "274--283",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Sprangle:1997:APM,
  author = "Eric Sprangle and Robert S. Chappell and Mitch Alsup and Yale N.
Patt",
  title = "The agree predictor: a mechanism for reducing negative branch history interference",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "284--291",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Michaud:1997:TCC,
  author = "Pierre Michaud and Andr{\'e} Seznec and Richard Uhlig",
  title = "Trading conflict and capacity aliasing in conditional branch predictors",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "292--303",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Emer:1997:LDP,
  author = "Joel Emer and Nikolas Gloy",
  title = "A language for describing predictors and its application to automatic synthesis",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "304--314",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Johnson:1997:RTA,
  author = "Teresa L. Johnson and Wen-mei W.
Hwu",
  title = "Run-time adaptive cache hierarchy management via reference analysis",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "315--326",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Fromm:1997:EEI,
  author = "Richard Fromm and Stylianos Perissakis and Neal Cardwell and Christoforos Kozyrakis and Bruce McGaughy and David Patterson and Tom Anderson and Katherine Yelick",
  title = "The energy efficiency of {IRAM} architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "327--337",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Burger:1997:DA,
  author = "Doug Burger and Stefanos Kaxiras and James R.
Goodman",
  title = "{DataScalar} architectures",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "2",
  pages = "338--349",
  month = may,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:48 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

%%% Volume 25, number 3 (June 1997) starts here.

@Article{Wilkes:1997:CLS,
  author = "Maurice Wilkes and Andrew Hopper",
  title = "The collapsed {LAN}: a solution to a bandwidth problem?",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "3",
  pages = "1--5",
  month = jun,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:58 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Jokinen:1997:CDP,
  author = "Tommi Jokinen and Chia-Jiu Wang",
  title = "Cache design with path balancing table, skewing and indirect tags",
  journal = j-COMP-ARCH-NEWS,
  volume = "25",
  number = "3",
  pages = "6--12",
  month = jun,
  year = "1997",
  CODEN = "CANED2",
  ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:58 MDT 2006",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
  journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Burger:1997:STS,
  author = "Doug Burger and Todd M.
Austin", title = "The {SimpleScalar} tool set, version 2.0", journal = j-COMP-ARCH-NEWS, volume = "25", number = "3", pages = "13--25", month = jun, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1997:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "25", number = "3", pages = "26--27", month = jun, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{VanMeter:1997:RCL, author = "Rodney {Van Meter} and Greg Finn and Steve Hotz and Dave Dyer", title = "Response to the collapsed {LAN}", journal = j-COMP-ARCH-NEWS, volume = "25", number = "4", pages = "1--2", month = sep, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hu:1997:OES, author = "Weiwu Hu and Peisu Xia", title = "Out-of-order execution in sequentially consistent shared-memory systems", journal = j-COMP-ARCH-NEWS, volume = "25", number = "4", pages = "3--10", month = sep, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khalid:1997:NTS, author = "Humayun Khalid", title = "A novel trace sampling technique", journal = j-COMP-ARCH-NEWS, volume = "25", number = "4", pages = "11--16", month = sep, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khalid:1997:PKC, author = "Humayun Khalid", title = "Performance of the {KORA-2} cache replacement scheme", journal = j-COMP-ARCH-NEWS, volume = "25", number = "4", pages = "17--21", month = sep, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jutla:1997:IAP, author = "D. N. Jutla and P. Bodorik", title = "Improving applications performance: a memory model and cache architecture", journal = j-COMP-ARCH-NEWS, volume = "25", number = "4", pages = "22--29", month = sep, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ulmann:1997:NEP, author = "B. 
Ulmann", title = "{NICE}: an elegant and powerful 32-bit architecture", journal = j-COMP-ARCH-NEWS, volume = "25", number = "4", pages = "30--35", month = sep, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1997:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "25", number = "4", pages = "36--41", month = sep, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pai:1997:RRS, author = "Vijay S. Pai and Parthasarathy Ranganathan and Sarita V. 
Adve", title = "{RSIM}: {Rice} simulator for {ILP} multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "25", number = "5", pages = "1--1", month = dec, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shi:1997:IID, author = "Weisong Shi and Weiwu Hu and Ming Zhu", title = "An innovative implementation for directory-based cache coherence in shared memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "25", number = "5", pages = "2--9", month = dec, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1997:INd, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "25", number = "5", pages = "10--14", month = dec, year = "1997", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ulmann:1998:ILE, author = "B. 
Ulmann", title = "Instruction looping, an extension to conditional execution", journal = j-COMP-ARCH-NEWS, volume = "26", number = "1", pages = "3--4", month = mar, year = "1998", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1216461.1216462", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:32 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The following article describes an easy to implement but very powerful extension to simple conditional execution based program flow control as used for example in the ARM RISC processors and others.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Haring:1998:IWP, author = "G{\"u}nter Haring and Christoph Lindemann and Martin Reiser", title = "International workshop performance evaluation --- origins and directions", journal = j-COMP-ARCH-NEWS, volume = "26", number = "1", pages = "5--6", month = mar, year = "1998", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1216461.1216463", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:32 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Performance Evaluation is a discipline of Computer Science for some thirty years. It seems time to take stock of what we were doing. 
That is, provide answers to the following questions: {$\bullet$} What are its scientific contributions? {$\bullet$} What is its relevance in industry and business? {$\bullet$} What is its standing in academia? {$\bullet$} Where is the field headed? {$\bullet$} What are its success stories and failures? {$\bullet$} What are its current burning questions?", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Munsil:1998:RSU, author = "Wes Munsil and Chia-Jiu Wang", title = "Reducing stack usage in {Java} bytecode execution", journal = j-COMP-ARCH-NEWS, volume = "26", number = "1", pages = "7--11", month = mar, year = "1998", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1216461.1216464", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:32 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "For many years, the Tomasulo method of dynamically scheduling instructions for execution in a load/store processor has been known and used. This paper presents an adaptation of the Tomasulo method to a stack-based processor architecture, and illustrates its use in a software simulator of a subset of the Java Virtual Machine. Experimental results show that the adapted Tomasulo method reduces stack usage, in some cases eliminating it altogether.
This method should be of interest to computer architects and those involved in the implementation and use of the Java programming language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1998:INaa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "26", number = "1", pages = "12--17", month = mar, year = "1998", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1216461.1216465", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:32 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on Internet --- an international computer network. As always, the opinions expressed in this column are the personal views of the authors, and do not necessarily represent the institutions to which they are affiliated. Text which sets the context of a message appears in italics; this is usually text the author has quoted from earlier messages. 
The code-like expressions below the authors' names are their addresses on Internet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Moudgill:1998:TFS, author = "Mayan Moudgill", title = "Techniques for fast simulation of associative cache directories", journal = j-COMP-ARCH-NEWS, volume = "26", number = "2", pages = "1--8", month = may, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:48 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chung:1998:LBC, author = "Byung-Kwon Chung and Jih-Kwon Peir", title = "{LRU}-based column-associative caches", journal = j-COMP-ARCH-NEWS, volume = "26", number = "2", pages = "9--17", month = may, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:48 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1998:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "26", number = "2", pages = "18--22", month = may, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:48 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Barroso:1998:MSC, author = "Luiz Andr{\'e} Barroso and Kourosh Gharachorloo and Edouard Bugnion", title = "Memory system characterization of commercial workloads", 
journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "3--14", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Keeton:1998:PCQ, author = "Kimberly Keeton and David A. Patterson and Yong Qiang He and Roger C. Raphael and Walter E. Baker", title = "Performance characterization of a {Quad Pentium Pro SMP} using {OLTP} workloads", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "15--26", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:1998:ECD, author = "Dennis C. Lee and Patrick J. Crowley and Jean-Loup Baer and Thomas E. Anderson and Brian N. Bershad", title = "Execution characteristics of desktop applications on {Windows NT}", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "27--38", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lo:1998:ADW, author = "Jack L. Lo and Luiz Andr{\'e} Barroso and Susan J. Eggers and Kourosh Gharachorloo and Henry M. Levy and Sujay S. 
Parekh", title = "An analysis of database workload performance on simultaneous multithreaded processors", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "39--50", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Evers:1998:ACP, author = "Marius Evers and Sanjay J. Patel and Robert S. Chappell and Yale N. Patt", title = "An analysis of correlation and predictability: what makes two-level branch predictors work", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "52--61", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Federovsky:1998:BPB, author = "Eitan Federovsky and Meir Feder and Shlomo Weiss", title = "Branch prediction based on universal data compression algorithms", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "62--72", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sazeides:1998:MPP, author = "Yiannakis Sazeides and James E.
Smith", title = "Modeling program predictability", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "73--84", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cox:1998:MLT, author = "Michael Cox and Narendra Bhandari and Michael Shantz", title = "Multi-level texture caching for {$3$D} graphics hardware", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "86--97", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eberle:1998:SQC, author = "Hans Eberle and Erwin Oertli", title = "{Switcherland}: a {QoS} communication architecture for workstation clusters", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "98--108", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alvarez:1998:DDA, author = "Guillermo A. Alvarez and Walter A. Burkhard and Larry J. 
Stockmeyer and Flaviu Cristian", title = "Declustered disk array architectures with optimal and near-optimal parallelism", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "109--120", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Grunwald:1998:CES, author = "Dirk Grunwald and Artur Klauser and Srilatha Manne and Andrew Pleszkun", title = "Confidence estimation for speculation control", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "122--131", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Manne:1998:PGS, author = "Srilatha Manne and Artur Klauser and Dirk Grunwald", title = "Pipeline gating: speculation control for energy reduction", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "132--141", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chrysos:1998:MDP, author = "George Z. Chrysos and Joel S. 
Emer", title = "Memory dependence prediction using store sets", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "142--153", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Juan:1998:DHL, author = "Toni Juan and Sanji Sanjeevan and Juan J. Navarro", title = "Dynamic history-length fitting: a third level of adaptivity for branch prediction", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "155--166", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Driesen:1998:AIB, author = "Karel Driesen and Urs H{\"o}lzle", title = "Accurate indirect branch prediction", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "167--178", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mukherjee:1998:UPA, author = "Shubhendu S. Mukherjee and Mark D. 
Hill", title = "Using prediction to accelerate coherence protocols", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "179--190", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Oskin:1998:APC, author = "Mark Oskin and Frederic T. Chong and Timothy Sherwood", title = "Active pages: a computation model for intelligent memory", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "192--203", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Swanson:1998:ITR, author = "Mark Swanson and Leigh Stoller and John Carter", title = "Increasing {TLB} reach using superpages backed by shadow memory", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "204--213", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Qiu:1998:ODA, author = "Xiaogang Qiu and Michel Dubois", title = "Options for dynamic address translation in {COMAs}", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "214--225", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = 
"Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{August:1998:IPS, author = "David I. August and Daniel A. Connors and Scott A. Mahlke and John W. Sias and Kevin M. Crozier and Ben-Chung Cheng and Patrick R. Eaton and Qudus B. Olaniran and Wen-mei W. Hwu", title = "Integrated predicated and speculative execution in the {IMPACT EPIC} architecture", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "227--237", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wallace:1998:TMP, author = "Steven Wallace and Brad Calder and Dean M. 
Tullsen", title = "Threaded multiple path execution", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "238--249", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Klauser:1998:SEE, author = "Artur Klauser and Abhijit Paithankar and Dirk Grunwald", title = "Selective eager execution on the {PolyPath} architecture", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "250--259", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patel:1998:ITC, author = "Sanjay Jeram Patel and Marius Evers and Yale N. 
Patt", title = "Improving trace cache effectiveness with branch promotion and trace packing", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "262--271", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gabbay:1998:EIF, author = "Freddy Gabbay and Avi Mendelson", title = "The effect of instruction fetch bandwidth on value prediction", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "272--281", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Albonesi:1998:DIC, author = "David H. Albonesi", title = "Dynamic {IPC\slash clock} rate optimization", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "282--292", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:1998:PMC, author = "Yinong Zhang and George B. 
{Adams III}", title = "Performance modeling and code partitioning for the {DS} architecture", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "293--304", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Keckler:1998:EFG, author = "Stephen W. Keckler and William J. Dally and Daniel Maskit and Nicholas P. Carter and Andrew Chang and Whay S. Lee", title = "Exploiting fine-grain thread level parallelism on the {MIT} multi-{ALU} processor", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "306--317", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Abandah:1998:EAT, author = "Gheith A. Abandah and Edward S. Davidson", title = "Effects of architectural and technological advances on the {HP\slash Convex Exemplar}'s memory and communication performance", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "318--329", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Blumrich:1998:DCS, author = "Matthias A. Blumrich and Richard D. Alpert and Yuqun Chen and Douglas W. Clark and Stefanos N. 
Damianakis and Cezary Dubnicki and Edward W. Felten and Liviu Iftode and Kai Li and Margaret Martonosi and Robert A. Shillner", title = "Design choices in the {SHRIMP} system: an empirical study", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "330--341", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Soundararajan:1998:FUM, author = "Vijayaraghavan Soundararajan and Mark Heinrich and Ben Verghese and Kourosh Gharachorloo and Anoop Gupta and John Hennessy", title = "Flexible use of memory for replication\slash migration in cache-coherent {DSM} multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "342--355", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kumar:1998:ESL, author = "Sanjeev Kumar and Christopher Wilkerson", title = "Exploiting spatial locality in data caches using spatial footprints", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "357--368", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lynch:1998:LLL, author = "William L. Lynch and Gary Lauterbach and Joseph I. 
Chamdani", title = "Low load latency through sum-addressed memory {(SAM)}", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "369--379", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sorin:1998:AES, author = "Daniel J. Sorin and Vijay S. Pai and Sarita V. Adve and Mary K. Vernon and David A. Wood", title = "Analytic evaluation of shared-memory systems with {ILP} processors", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "380--391", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Golla:1998:CEB, author = "Prasad N. Golla and Eric C. Lin", title = "A comparison of the effect of branch prediction on multithreaded and scalar architectures", journal = j-COMP-ARCH-NEWS, volume = "26", number = "4", pages = "3--11", month = sep, year = "1998", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1216475.1216476", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Speculative instructions execution requires dynamic branch predictors to increase the performance of a processor by executing from predicted branch target routines. 
Conventional Scalar architectures such as the Superscalar or Multiscalar architecture executes from a single stream, while a Multithreaded architecture executes from multiple streams at a time. Several aggressive branch predictors have been proposed with high prediction accuracies. Unfortunately, none of the branch predictors can provide 100\% accuracy. Therefore, there is an inherent limitation on speculative execution in real implementation. In this paper, we show that Multithreaded architecture is a better candidate for utilizing speculative execution than Scalar architectures. Generally the branch prediction performance degradation is compounded for larger window sizes on Scalar architectures, while for a Multithreaded architecture, by increasing the number of executing threads, we could sustain a higher performance for a large aggregated speculative window size. Hence, heavier workloads may increase performance and utilization for Multithreaded architectures. We present analytical and simulation results to support our argument.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1998:INc, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "26", number = "4", pages = "12--16", month = sep, year = "1998", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1216475.1216477", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on Internet---an international computer network. As always, the opinions expressed in this column are the personal views of the authors, and do not necessarily represent the institutions to which they are affiliated. 
Text which sets the context of a message appears in italics; this is usually text the author has quoted from earlier messages. The code-like expressions below the authors' names are their addresses on Internet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Machanick:1998:SVL, author = "Philip Machanick", title = "Streaming vs. latency in information mass-transit", journal = j-COMP-ARCH-NEWS, volume = "26", number = "5", pages = "4--6", month = dec, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lafitte:1998:GMD, author = "Jean-Louis Lafitte", title = "A generalized mapping device to help memory latency", journal = j-COMP-ARCH-NEWS, volume = "26", number = "5", pages = "7--13", month = dec, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ashraf:1998:IRM, author = "Farooq Ashraf and Mostafa Abd-El-Barr and Khalid Al-Tawil", title = "Introduction to routing in multicomputer networks", journal = j-COMP-ARCH-NEWS, volume = "26", number = "5", pages = "14--21", month = dec, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = 
"https://dl.acm.org/loi/sigarch", } @Article{Wilmot:1998:DTM, author = "Dick Wilmot", title = "Data threaded microarchitecture", journal = j-COMP-ARCH-NEWS, volume = "26", number = "5", pages = "22--32", month = dec, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yuen:1999:SR, author = "C. K. Yuen", title = "Stack and {RISC}", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "3--9", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Baylor:1999:USS, author = "Sandra Johnson Baylor", title = "Unified scalable shared memory architectures", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "10--21", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DeWitt:1999:PTL, author = "Anthony DeWitt and Thomas Gross", title = "The potential of thread-level speculation based on value profiling", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "22--22", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kalamatianos:1999:IAI, author = "John Kalamatianos and David R. Kaeli", title = "Improving the accuracy of indirect branch prediction via branch classification", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "23--26", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ju:1999:PMD, author = "Roy Dz-ching Ju and Jean-Fran{\c{c}}ois Collard and Karim Oukbir", title = "Probabilistic memory disambiguation and its application to data speculation", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "27--30", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Postiff:1999:LIL, author = "Matthew A. Postiff and David A. Greene and Gary S. Tyson and Trevor N. 
Mudge", title = "The limits of instruction level parallelism in {SPEC95} applications", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "31--34", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yang:1999:LMJ, author = "Byung-Sun Yang and Junpyo Lee and Jinpyo Park and Soo-Mook Moon and Kemal Ebcio{\u{g}}lu and Erik Altman", title = "Lightweight monitor for {Java VM}", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "35--38", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rao:1999:SAU, author = "Amit Rao and Santosh Pande", title = "Storage assignment using expression tree transformations to generate compact and efficient {DSP} code", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "39--42", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Flautner:1999:HLS, author = "Kriszti{\'a}n Flautner and Gary S. 
Tyson and Trevor Mudge", title = "A high level simulator integrated with the {Mirv} compiler", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "43--46", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Casse:1999:UAI, author = "H. Cass{\'e} and L. F{\'e}raud and C. Rochange and P. Sainrat", title = "Using the abstract interpretation technique for static pointer analysis", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "47--50", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bahar:1999:CSC, author = "Iris Bahar and Brad Calder and Dirk Grunwald", title = "A comparison of software code reordering and victim buffers", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "51--54", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Carr:1999:ISP, author = "Steve Carr and Philip Sweany", title = "Improving software pipelining with hardware support for self-spatial loads", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "55--58", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 
(ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Barua:1999:MCM, author = "Rajeev Barua and Walter Lee and Saman Amarasinghe and Anant Agarwal", title = "{Maps}: a compiler-managed memory system for {Raw} machines", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "4--15", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vajapeyam:1999:DVM, author = "Sriram Vajapeyam and P. J. Joseph and Tulika Mitra", title = "Dynamic vectorization: a mechanism for exploiting far-flung {ILP} in ordinary programs", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "16--27", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goldstein:1999:PCP, author = "Seth Copen Goldstein and Herman Schmit and Matthew Moe and Mihai Budiu and Srihari Cadambi and R.
Reed Taylor and Ronald Laufer", title = "{PipeRench}: a coprocessor for streaming multimedia acceleration", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "28--39", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yoaz:1999:STI, author = "Adi Yoaz and Mattan Erez and Ronny Ronen and Stephan Jourdan", title = "Speculation techniques for improving load related instruction scheduling", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "42--53", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bekerman:1999:CLA, author = "Michael Bekerman and Stephan Jourdan and Ronny Ronen and Gilad Kirshenboim and Lihu Rappoport and Adi Yoaz and Uri Weiser", title = "Correlated load-address predictors", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "54--63", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Calder:1999:SVP, author = "Brad Calder and Glenn Reinman and Dean M.
Tullsen", title = "Selective value prediction", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "64--74", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Qiu:1999:TLM, author = "Xiaogang Qiu and Michel Dubois", title = "Tolerating late memory traps in {ILP} processors", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "76--87", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Luk:1999:MFE, author = "Chi-Keung Luk and Todd C. 
Mowry", title = "Memory forwarding: enabling aggressive layout optimizations by guaranteeing the safety of data relocation", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "88--99", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cho:1999:DLV, author = "Sangyeun Cho and Pen-Chung Yew and Gyungho Lee", title = "Decoupling local variable accesses in a wide-issue superscalar processor", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "100--110", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Roth:1999:EJP, author = "Amir Roth and Gurindar S. Sohi", title = "Effective jump-pointer prefetching for linked data structures", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "111--121", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ranganathan:1999:PIV, author = "Parthasarathy Ranganathan and Sarita Adve and Norman P. 
Jouppi", title = "Performance of image and video processing with general-purpose processors and media {ISA} extensions", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "124--135", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Merten:1999:HDP, author = "Matthew C. Merten and Andrew R. Trick and Christopher N. George and John C. Gyllenhaal and Wen-mei W. Hwu", title = "A hardware-driven profiling scheme for identifying program hot spots to support runtime optimization", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "136--147", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shen:1999:CRF, author = "Xiaowei Shen and Arvind and Larry Rudolph", title = "Commit-reconcile \& fences {(CRF)}: a new memory model for architects and compiler writers", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "150--161", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gniady:1999:SIR, author = "Chris Gniady and Babak Falsafi and T. N. 
Vijaykumar", title = "Is {SC + ILP = RC}?", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "162--171", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "Instruction level parallelism (ILP); release consistency (RC); sequential consistency (SC)", } @Article{Lai:1999:MSP, author = "An-Chow Lai and Babak Falsafi", title = "Memory sharing predictor: the key to a speculative coherent {DSM}", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "172--183", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chappell:1999:SSM, author = "Robert S. Chappell and Jared Stark and Sangwook P. Kim and Steven K. Reinhardt and Yale N. 
Patt", title = "Simultaneous subordinate microthreading {(SSMT)}", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "186--195", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Black:1999:BBT, author = "Bryan Black and Bohuslav Rychlik and John Paul Shen", title = "The block-based trace cache", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "196--207", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{August:1999:PDL, author = "David I. August and John W. Sias and Jean-Michel Puiatti and Scott A. Mahlke and Daniel A. Connors and Kevin M. Crozier and Wen-mei W. 
Hwu", title = "The program decision logic approach to predicated execution", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "208--219", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cuppu:1999:PCC, author = "Vinodh Cuppu and Bruce Jacob and Brian Davis and Trevor Mudge", title = "A performance comparison of contemporary {DRAM} architectures", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "222--233", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reinman:1999:SFE, author = "Glenn Reinman and Todd Austin and Brad Calder", title = "A scalable front-end architecture for fast instruction delivery", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "234--245", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:1999:AEA, author = "Seongwoo Kim and Arun K. 
Somani", title = "Area efficient architectures for information integrity in cache memories", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "246--255", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nakra:1999:VPV, author = "Tarun Nakra and Rajiv Gupta and Mary Lou Soffa", title = "Value prediction in {VLIW} machines", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "258--269", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tullsen:1999:SVP, author = "Dean M. Tullsen and John S. 
Seng", title = "Storageless value prediction using prior register values", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "270--279", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bilas:1999:UNI, author = "Angelos Bilas and Cheng Liao and Jaswinder Pal Singh", title = "Using network interface support to avoid asynchronous protocol processing in shared virtual memory systems", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "282--293", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bilir:1999:MSN, author = "E. Ender Bilir and Ross M. Dickson and Ying Hu and Manoj Plakal and Daniel J. Sorin and Mark D. Hill and David A. 
Wood", title = "Multicast snooping: a new coherence method using a multicast address network", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "294--304", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jiang:1999:SAP, author = "Dongming Jiang and Jaswinder Pal Singh", title = "Scaling application performance on a cache-coherent multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "305--316", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:1999:MSF, author = "Anonymous", title = "In memoriam---{SIGARCH} founder: {Caxton C. Foster}", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "1--3", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hwang:1999:SSI, author = "Seung H. Hwang and Gwan S. 
Choi", title = "Selective-set-invalidation {(SSI)} for soft-error-resilient cache architecture", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "4--9", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheng:1999:DHP, author = "Peng Cheng and Hai Jin and Jiangling Zhang", title = "Design of high performance {RAID} in real-time system", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "10--17", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yuen:1999:ASC, author = "C. K. 
Yuen", title = "Architectural support for the cache based vector computation", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "18--23", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Driker:1999:DCC, author = "Benjamin Driker", title = "Disbursed control computer architecture", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "24--31", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khalid:1999:PEM, author = "Humayun Khalid", title = "Performance evaluation of multimedia systems with {MPEG-2} bitstreams", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "32--37", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khalid:1999:MPE, author = "Humayun Khalid", title = "A methodology for performance evaluation of systems with large emulation code", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "38--42", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource =
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khalid:1999:TMB, author = "Humayun Khalid", title = "Tracing multimedia benchmarks with five degrees of validation", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "43--48", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khalid:1999:PET, author = "Humayun Khalid", title = "Performance evaluation of two operating systems", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "49--52", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1999:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "27", number = "3", pages = "53--60", month = jun, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Machanick:1999:CRA, author = "Philip Machanick", title = "Correction to {RAMpage ASPLOS} paper", journal = j-COMP-ARCH-NEWS, volume = "27", number = "4", pages = "2--5", month = sep, year = "1999", CODEN =
"CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shahhoseini:1999:ABP, author = "H. S. Shahhoseini and M. Naderi and S. Nemati", title = "Achieving the best performance on superscalar processors", journal = j-COMP-ARCH-NEWS, volume = "27", number = "4", pages = "6--11", month = sep, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:1999:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "27", number = "4", pages = "12--14", month = sep, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Torrant:1999:SMS, author = "Marc Torrant and Muhammad Shaaban and Roy Czernikowski and Ken Hsu", title = "A simultaneous multithreading simulator", journal = j-COMP-ARCH-NEWS, volume = "27", number = "5", pages = "1--5", month = dec, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } 
@Article{Thorson:1999:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "27", number = "5", pages = "6--10", month = dec, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dai:2000:LSO, author = "Min Dai and Christine Eisenbeis and Sid-Ahmed-Ali Touati", title = "Load-store optimization for software pipelining", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "3--10", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Clauss:2000:AML, author = "Philippe Clauss and Beno{\^\i}t Meister", title = "Automatic memory layout transformations to optimize spatial locality in parameterized loop nests", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "11--19", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kreaseck:2000:LTB, author = "Barbara Kreaseck and Dean Tullsen and Brad Calder", title = "Limits of task-based parallelism in irregular applications", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "20--20", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 
0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:2000:RVC, author = "Junpyo Lee and Byung-Sun Yang and Suhyun Kim and Kemal Ebcio{\u{g}}lu and Erik Altman and Seungil Lee and Yoo C. Chung and Heungbok Lee and Je Hyung Lee and Soo-Mook Moon", title = "Reducing virtual call overheads in a {Java VM} just-in-time compiler", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "21--33", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sadler:2000:APE, author = "Chris Sadler and Sandeep K. S. 
Gupta and Rohit Bhatia", title = "Applying predication to efficiently handle runtime class testing", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "34--42", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bermudo:2000:OCM, author = "Nerina Bermudo and Xavier Vera and Antonio Gonz{\'a}lez and Josep Llosa", title = "Optimizing cache miss equations polyhedra", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "43--52", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Unger:2000:CCA, author = "A. Unger and E. Zehendner and Th. 
Ungerer", title = "A combined compiler and architecture technique to control multithreaded execution of branches and loop iterations", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "53--61", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Aydin:2000:UCL, author = "Hakan Aydin and David Kaeli", title = "Using cache line coloring to perform aggressive procedure inlining", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "62--71", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tyagi:2000:COP, author = "Akhilesh Tyagi and Gyungho Lee", title = "A compiler optimization paradigm for dynamic energy management", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "72--76", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2000:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "77--78", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 
2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Steffan:2000:SAT, author = "J. Gregory Steffan and Christopher B. Colohan and Antonia Zhai and Todd C. Mowry", title = "A scalable approach to thread-level speculation", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "1--12", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cintra:2000:ASS, author = "Marcelo Cintra and Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas", title = "Architectural support for scalable speculative parallelization in shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "13--24", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reinhardt:2000:TFD, author = "Steven K. Reinhardt and Shubhendu S.
Mukherjee", title = "Transient fault detection via simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "25--36", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jacobson:2000:TP, author = "Quinn Jacobson and James E. Smith", title = "Trace preconstruction", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "37--46", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rakvic:2000:CTM, author = "Ryan Rakvic and Bryan Black and John Paul Shen", title = "Completion time multiple branch prediction for enhancing trace cache performance", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "47--58", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Merten:2000:HMD, author = "Matthew C. Merten and Andrew R. Trick and Erik M. Nystrom and Ronald D. Barnes and Wen-mei W.
Hwu", title = "A hardware mechanism for dynamic extraction and relayout of program hot spots", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "59--70", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Oskin:2000:HCS, author = "Mark Oskin and Frederic T. Chong and Matthew Farrens", title = "{HLS}: combining statistical and symbolic simulation to guide microprocessor designs", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "71--82", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brooks:2000:WFA, author = "David Brooks and Vivek Tiwari and Margaret Martonosi", title = "{Wattch}: a framework for architectural-level power analysis and optimizations", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "83--94", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vijaykrishnan:2000:EDI, author = "N. Vijaykrishnan and M. Kandemir and M. J. Irwin and H. S. Kim and W.
Ye", title = "Energy-driven integrated hardware-software optimizations using {SimplePower}", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "95--106", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hallnor:2000:FAS, author = "Erik G. Hallnor and Steven K. Reinhardt", title = "A fully associative software-managed cache design", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "107--116", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Saulsbury:2000:RBT, author = "Ashley Saulsbury and Fredrik Dahlgren and Per Stenstr{\"o}m", title = "Recency-based {TLB} preloading", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "117--127", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rixner:2000:MAS, author = "Scott Rixner and William J. Dally and Ujval J. Kapasi and Peter Mattson and John D. 
Owens", title = "Memory access scheduling", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "128--138", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lai:2000:SAT, author = "An-Chow Lai and Babak Falsafi", title = "Selective, accurate, and timely self-invalidation using last-touch prediction", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "139--148", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Margolus:2000:EDA, author = "Norman Margolus", title = "An embedded {DRAM} architecture for large-scale spatial-lattice computations", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "149--160", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mai:2000:SMM, author = "Ken Mai and Tim Paaske and Nuwan Jayasena and Ron Ho and William J. 
Dally and Mark Horowitz", title = "Smart {Memories}: a modular reconfigurable architecture", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "161--171", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zilles:2000:UBS, author = "Craig B. Zilles and Gurindar S. Sohi", title = "Understanding the backward slices of performance degrading instructions", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "172--181", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lepak:2000:VLS, author = "Kevin M. Lepak and Mikko H. Lipasti", title = "On the value locality of store instructions", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "182--191", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cvetanovic:2000:PAA, author = "Zarka Cvetanovic and R. E. 
Kessler", title = "Performance analysis of the {Alpha 21264}-based {Compaq ES40} system", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "192--202", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Faraboschi:2000:LTP, author = "Paolo Faraboschi and Geoffrey Brown and Joseph A. Fisher and Giuseppe Desoli and Fred Homewood", title = "{Lx}: a technology platform for customizable {VLIW} embedded processing", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "203--213", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ranganathan:2000:RCT, author = "Parthasarathy Ranganathan and Sarita Adve and Norman P. 
Jouppi", title = "Reconfigurable caches and their application to media processing", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "214--224", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ye:2000:CHP, author = "Zhi Alex Ye and Andreas Moshovos and Scott Hauck and Prithviraj Banerjee", title = "{CHIMAERA}: a high-performance architecture with a tightly-coupled reconfigurable functional unit", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "225--235", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Henry:2000:CWW, author = "Dana S. Henry and Bradley C. Kuszmaul and Gabriel H. Loh and Rahul Sami", title = "Circuits for wide-window superscalar processors", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "236--247", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agarwal:2000:CRV, author = "Vikas Agarwal and M. S. Hrishikesh and Stephen W. 
Keckler and Doug Burger", title = "Clock rate versus {IPC}: the end of the road for conventional microarchitectures", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "248--259", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smith:2000:VIS, author = "J. E. Smith and Greg Faanes and Rabin Sugumar", title = "Vector instruction set support for conditional operations", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "260--269", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chou:2000:IPC, author = "Yuan Chou and John Paul Shen", title = "Instruction path coprocessors", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "270--281", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Barroso:2000:PSA, author = "Luiz Andr{\'e} Barroso and Kourosh Gharachorloo and Robert McNamara and Andreas Nowatzyk and Shaz Qadeer and Barton Sano and Scott Smith and Robert Stets and Ben Verghese", title = "{Piranha}: a scalable architecture based on single-chip multiprocessing", journal = j-COMP-ARCH-NEWS, volume = "28", number 
= "2", pages = "282--293", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Radhakrishnan:2000:AIE, author = "Ramesh Radhakrishnan and Deependra Talla and Lizy Kurian John", title = "Allowing for {ILP} in an embedded {Java} processor", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "294--305", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bekerman:2000:ELA, author = "Michael Bekerman and Adi Yoaz and Freddy Gabbay and Stephan Jourdan and Maxim Kalaev and Ronny Ronen", title = "Early load address resolution via register tracking", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "306--315", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cruz:2000:MBR, author = "Jos{\'e}-Lorenzo Cruz and Antonio Gonz{\'a}lez and Mateo Valero and Nigel P. 
Topham", title = "Multiple-banked register file architectures", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "316--325", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fernandez:2000:EPN, author = "Benjam{\'\i}n Sahelices Fern{\'a}ndez and Diego R. Llanos Ferraris and Agust{\'\i}n de Dios Hern{\'a}ndez", title = "Exploiting parallelism in a network of workstations using {COMA-BC}", journal = j-COMP-ARCH-NEWS, volume = "28", number = "3", pages = "1--8", month = jun, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2000:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "28", number = "3", pages = "9--13", month = jun, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:59 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lafitte:2000:RDH, author = "Jean-Louis Lafitte", title = "Regarding a device to help battering the {RAM} wall", journal = j-COMP-ARCH-NEWS, volume = "28", number = "4", pages = "4--10", month = sep, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Petit:2000:LSE, author = "S. Petit and J. A. Gil and J. Sahuquillo and A. Pont", title = "{LIDE}: a simulation environment for shared virtual memory systems", journal = j-COMP-ARCH-NEWS, volume = "28", number = "4", pages = "11--18", month = sep, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:14 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schlosser:2000:DCS, author = "Steven W. Schlosser and John Linwood Griffin and David F. Nagle and Gregory R. Ganger", title = "Designing computer systems with {MEMS}-based storage", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "1--12", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gharachorloo:2000:ADA, author = "Kourosh Gharachorloo and Madhu Sharma and Simon Steely and Stephen {Van Doren}", title = "Architecture and design of {AlphaServer GS320}", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "13--24", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = 
"https://dl.acm.org/loi/sigarch", } @Article{Martin:2000:TSA, author = "Milo M. K. Martin and Daniel J. Sorin and Anastassia Ailamaki and Alaa R. Alameldeen and Ross M. Dickson and Carl J. Mauer and Kevin E. Moore and Manoj Plakal and Mark D. Hill and David A. Wood", title = "Timestamp snooping: an approach for extending {SMPs}", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "25--36", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nanda:2000:MPR, author = "Ashwini Nanda and Kwok-Ken Mak and Krishnan Sugarvanam and Ramendra K. Sahoo and Vijayaraghavan Soundararajan and T. Basil Smith", title = "{MemorIES3}: a programmable, real-time hardware emulation tool for multiprocessor server design", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "37--48", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gibson:2000:FVS, author = "Jeff Gibson and Robert Kunz and David Ofelt and Mark Horowitz and John Hennessy and Mark Heinrich", title = "{FLASH} vs. 
{(Simulated) FLASH}: closing the simulation loop", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "49--58", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chou:2000:UML, author = "Andy Chou and Benjamin Chelf and Dawson Engler and Mark Heinrich", title = "Using meta-level compilation to check {FLASH} protocol code", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "59--70", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhoedjang:2000:EDA, author = "Raoul A. F. Bhoedjang and Kees Verstoep and Tim R{\"u}hl and Henri E. Bal and Rutger F. H. Hofman", title = "Evaluating design alternatives for reliable communication on high-speed networks", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "71--81", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mattson:2000:CS, author = "Peter Mattson and William J. Dally and Scott Rixner and Ujval J. Kapasi and John D. 
Owens", title = "Communication scheduling", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "82--92", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hill:2000:SAD, author = "Jason Hill and Robert Szewczyk and Alec Woo and Seth Hollar and David Culler and Kristofer Pister", title = "System architecture directions for networked sensors", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "93--104", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lebeck:2000:PAP, author = "Alvin R. Lebeck and Xiaobo Fan and Heng Zeng and Carla Ellis", title = "Power aware page allocation", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "105--116", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Berger:2000:HSM, author = "Emery D. Berger and Kathryn S. McKinley and Robert D. Blumofe and Paul R. 
Wilson", title = "{Hoard}: a scalable memory allocator for multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "117--128", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Flautner:2000:TLP, author = "Kristi{\'a}n Flautner and Rich Uhlig and Steve Reinhardt and Trevor Mudge", title = "Thread-level parallelism and interactive performance of desktop applications", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "129--138", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kawahito:2000:ENP, author = "Motohiro Kawahito and Hideaki Komatsu and Toshio Nakatani", title = "Effective null pointer check elimination utilizing hardware trap", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "139--149", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:2000:FVL, author = "Youtao Zhang and Jun Yang and Rajiv Gupta", title = "Frequent value locality and value-centric data cache design", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "150--159", month = dec, year = "2000", 
CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burrows:2000:EFV, author = "M. Burrows and U. Erlingson and S-T. A. Leung and M. T. Vandevoorde and C. A. Waldspurger and K. Walker and W. E. Weihl", title = "Efficient and flexible value sampling", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "160--167", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thekkath:2000:ASC, author = "David Lie and Chandramohan Thekkath and Mark Mitchell and Patrick Lincoln and Dan Boneh and John Mitchell and Mark Horowitz", title = "Architectural support for copy and tamper resistant software", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "168--177", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burke:2000:ASF, author = "Jerome Burke and John McDonald and Todd Austin", title = "Architectural support for fast symmetric-key cryptography", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "178--189", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kubiatowicz:2000:OAG, author = "John Kubiatowicz and David Bindel and Yan Chen and Steven Czerwinski and Patrick Eaton and Dennis Geels and Ramakrishna Gummadi and Sean Rhea and Hakim Weatherspoon and Chris Wells and Ben Zhao", title = "{OceanStore}: an architecture for global-scale persistent storage", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "190--201", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Duesterwald:2000:SPH, author = "Evelyn Duesterwald and Vasanth Bala", title = "Software profiling for hot path prediction: less is more", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "202--211", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zahir:2000:CCD, author = "Rumi Zahir and Jonathan Ross and Dale Morris and Drew Hess", title = "{OS} and compiler considerations in the design of the {IA-64} architecture", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "212--221", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", 
acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Connors:2000:HSD, author = "Daniel A. Connors and Hillery C. Hunter and Ben-Chung Cheng and Wen-mei W. Hwu", title = "Hardware support for dynamic activation of compiler-directed computation reuse", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "222--233", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Snavely:2000:SJS, author = "Allan Snavely and Dean M. Tullsen", title = "Symbiotic job scheduling for a simultaneous multithreaded processor", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "234--244", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Redstone:2000:AOS, author = "Joshua A. Redstone and Susan J. Eggers and Henry M. 
Levy", title = "An analysis of operating system behavior on a simultaneous multithreaded architecture", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "245--256", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sundaramoorthy:2000:SPI, author = "Karthik Sundaramoorthy and Zach Purser and Eric Rotenberg", title = "Slipstream processors: improving both performance and fault tolerance", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "257--268", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wilkes:2001:MGF, author = "Maurice V.
Wilkes", title = "The memory gap and the future of high performance memories", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "2--7", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Manjikian:2001:MESa, author = "Naraig Manjikian", title = "Multiprocessor enhancements of the {SimpleScalar} tool set", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "8--15", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2001:MAH, author = "Frank Wang", title = "A modified architecture for high-density {MRAM}", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "16--22", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Altman:2001:WWB, author = "Erik R. 
Altman and David Kaeli", title = "{WBT-2000}: {Workshop on Binary Translation 2000}", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "23--25", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Srivastava:2001:EOB, author = "Amitabh Srivastava", title = "Emerging opportunities for binary tools", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "26--26", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cain:2001:DBT, author = "Harold W. Cain and Kevin M. Lepak and Mikko H. 
Lipasti", title = "A dynamic binary translation approach to architectural simulation", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "27--36", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hilgendorf:2001:ITE, author = "Rolf Hilgendorf and Wolfram Sauer", title = "Instruction translation for an experimental {S/390} processor", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "37--42", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ronsse:2001:JRJ, author = "Michiel Ronsse and Koen {De Bosschere}", title = "{JiTI}: a robust just in time instrumentation technique", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "43--54", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ung:2001:OHP, author = "David Ung and Cristina Cifuentes", title = "Optimising hot paths in a dynamic binary translator", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "55--65", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 
09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gschwind:2001:OPE, author = "Michael Gschwind and Erik Altman", title = "Optimization and precise exceptions in dynamic compilation", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "66--74", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2001:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "29", number = "1", pages = "75--77", month = mar, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zilles:2001:EBP, author = "Craig Zilles and Gurindar Sohi", title = "Execution-based prediction using speculative slices", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "2--13", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Collins:2001:SPL, author = "Jamison D. Collins and Hong Wang and Dean M. 
Tullsen and Christopher Hughes and Yong-Fong Lee and Dan Lavery and John P. Shen", title = "Speculative precomputation: long-range prefetching of delinquent loads", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "14--25", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Balasubramonian:2001:DAP, author = "Rajeev Balasubramonian and Sandhya Dwarkadas and David H. Albonesi", title = "Dynamically allocating processor resources between nearby and distant {ILP}", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "26--37", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Luk:2001:TML, author = "Chi-Keung Luk", title = "Tolerating memory latency through software-controlled pre-execution in simultaneous multithreading processors", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "40--51", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Annavaram:2001:DPD, author = "Murali Annavaram and Jignesh M. Patel and Edward S. 
Davidson", title = "Data prefetching by dependence graph precomputation", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "52--61", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cuppu:2001:CLS, author = "Vinodh Cuppu and Bruce Jacob", title = "Concurrency, latency, or system overhead: which has the largest impact on uniprocessor {DRAM}-system performance?", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "62--71", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fields:2001:FPP, author = "Brian Fields and Shai Rubin and Rastislav Bod{\'\i}k", title = "Focusing processor policies via critical-path prediction", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "74--85", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sherwood:2001:ADF, author = "Timothy Sherwood and Brad Calder", title = "Automated design of finite state machine predictors for customized processors", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "86--97", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 
(ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wu:2001:BER, author = "Youfeng Wu and Dong-Yuan Chen and Jesse Fang", title = "Better exploration of region-level value locality with integrated computation reuse and value prediction", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "98--108", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wu:2001:CFF, author = "Lisa Wu and Chris Weaver and Todd Austin", title = "{CryptoManiac}: a fast flexible architecture for secure communication", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "110--119", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yum:2001:QPC, author = "Ki Hwan Yum and Eun Jung Kim and Chita R. 
Das", title = "{QoS} provisioning in clusters: an investigation of {Router} and {NIC} design", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "120--129", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Srinivasan:2001:LVC, author = "Srikanth T. Srinivasan and Roy Dz-ching Ju and Alvin R. Lebeck and Chris Wilkerson", title = "Locality vs. criticality", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "132--143", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lai:2001:DBP, author = "An-Chow Lai and Cem Fide and Babak Falsafi", title = "Dead-block prediction \& dead-block correlating prefetchers", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "144--154", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ramirez:2001:CLO, author = "Alex Ramirez and Luiz Andr{\'e} Barroso and Kourosh Gharachorloo and Robert Cohn and Josep Larriba-Pey and P. 
Geoffrey Lowney and Mateo Valero", title = "Code layout optimizations for transaction processing workloads", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "155--164", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Niemier:2001:EEW, author = "Michael Thaddeus Niemier and Peter M. Kogge", title = "Exploring and exploiting wire-level pipelining in emerging technologies", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "166--177", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goldstein:2001:NSC, author = "Seth Copen Goldstein and Mihai Budiu", title = "{NanoFabrics}: spatial computing using molecular electronics", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "178--191", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lie:2001:SME, author = "David Lie and Andy Chou and Dawson Engler and David L. 
Dill", title = "A simple method for extracting models for protocol code", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "192--203", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Prvulovic:2001:RAB, author = "Milos Prvulovic and Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Lawrence Rauchwerger and Josep Torrellas", title = "Removing architectural bottlenecks to the scalability of speculative parallelization", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "204--215", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bahar:2001:PER, author = "R. 
Iris Bahar and Srilatha Manne", title = "Power and energy reduction via pipeline balancing", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "218--229", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Folegnani:2001:EEI, author = "Daniele Folegnani and Antonio Gonz{\'a}lez", title = "Energy-effective issue logic", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "230--239", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kaxiras:2001:CDE, author = "Stefanos Kaxiras and Zhigang Hu and Margaret Martonosi", title = "Cache decay: exploiting generational behavior to reduce cache leakage power", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "240--251", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hughes:2001:VEM, author = "Christopher J. Hughes and Praful Kaul and Sarita V. 
Adve and Rohit Jain and Chanik Park and Jayanth Srinivasan", title = "Variability in the execution of multimedia applications and implications for architecture", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "254--265", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sastry:2001:RPS, author = "S. Subramanya Sastry and Rastislav Bod{\'\i}k and James E. Smith", title = "Rapid profiling via stratified sampling", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "278--289", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zilles:2001:BHC, author = "Craig B. Zilles", title = "Benchmark health considered harmful", journal = j-COMP-ARCH-NEWS, volume = "29", number = "3", pages = "4--5", month = jun, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thornock:2001:NTC, author = "Niki C. Thornock and J. 
Kelly Flanagan", title = "A national trace collection and distribution resource", journal = j-COMP-ARCH-NEWS, volume = "29", number = "3", pages = "6--10", month = jun, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2001:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "29", number = "3", pages = "11--15", month = jun, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Manjikian:2001:MESb, author = "Naraig Manjikian", title = "More enhancements of the {SimpleScalar} tool set", journal = j-COMP-ARCH-NEWS, volume = "29", number = "4", pages = "5--12", month = sep, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:15 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cantin:2001:CPS, author = "Jason F. Cantin and Mark D. 
Hill", title = "Cache performance for selected {SPEC CPU2000} benchmarks", journal = j-COMP-ARCH-NEWS, volume = "29", number = "4", pages = "13--18", month = sep, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:15 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:2001:PLA, author = "Jinsuo Zhang", title = "The predictability of load address", journal = j-COMP-ARCH-NEWS, volume = "29", number = "4", pages = "19--28", month = sep, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:15 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2001:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "29", number = "4", pages = "29--31", month = sep, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:15 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{El-Kharashi:2001:ATA, author = "M. Watheq El-Kharashi and Fayez Elguibaly and Kin F. 
Li", title = "Adapting {Tomasulo}'s algorithm for bytecode folding based {Java} processors", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "1--8", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bartolini:2001:PAC, author = "S. Bartolini and R. Giorgi and J. Protic and C. A. Prete and M. Valero", title = "Parallel architecture and compilation techniques: selection of workshop papers, {Guest Editors}' introduction", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "9--12", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Acquaviva:2001:ECE, author = "Andrea Acquaviva and Luca Benini and Bruno Ricc{\`o}", title = "Energy characterization of embedded real-time operating systems", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "13--18", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Moncusi:2001:IES, author = "M. 
Angels Moncusi and Alex Arenas and Jesus Labarta", title = "Improving energy saving in hard real time systems via a modified dual priority scheduling", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "19--24", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vahid:2001:PCP, author = "Frank Vahid and Rilesh Patel and Greg Stitt", title = "Propagating constants past software to hardware peripherals in fixed-application embedded systems", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "25--30", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Aslot:2001:PCS, author = "Vishal Aslot and Rudolf Eigenmann", title = "Performance characteristics of the {SPEC OMP2001} benchmarks", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "31--40", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bull:2001:MSO, author = "J. 
Mark Bull and Darragh O'Neill", title = "A microbenchmark suite for {OpenMP 2.0}", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "41--48", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nikolopoulos:2001:EMA, author = "D. S. Nikolopoulos and E. Artiaga and E. Ayguad{\'e} and J. Labarta", title = "Exploiting memory affinity in {OpenMP} through schedule reuse", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "49--55", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sung:2001:MDA, author = "Michael Sung and Ronny Krashinsky and Krste Asanovi{\'c}", title = "Multithreading decoupled architectures for complexity-effective general purpose computing", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "56--61", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Talla:2001:MDA, author = "Deependra Talla and Lizy K. 
John", title = "{MediaBreeze}: a decoupled architecture for accelerating multimedia applications", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "62--67", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nakajima:2001:MCS, author = "Tatsuo Nakajima", title = "A middleware component supporting flexible user interaction for networked home appliances", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "68--75", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Touzet:2001:SSE, author = "David Touzet and Jean-Marc Menaud and Fr{\'e}d{\'e}ric Weis and Paul Couderc and Michel Ban{\^a}tre", title = "{SIDE} surfer: enriching casual meetings with spontaneous information gathering", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "76--83", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Altman:2001:WBT, author = "Erik R. Altman and David R. 
Kaeli", title = "{Workshop on Binary Translation 2001}", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "84--85", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2001:INd, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "86--90", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Desikan:2002:EME, author = "Rajagopalan Desikan and Doug Burger and Stephen W. 
Keckler and Llorenc Cruz and Fernando Latorre and Antonio Gonz{\'a}lez and Mateo Valero", title = "Errata on {``Measuring Experimental Error in Microprocessor Simulation''}", journal = j-COMP-ARCH-NEWS, volume = "30", number = "1", pages = "2--4", month = mar, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chang:2002:ATI, author = "Fu-Chi Chang and Chia-Jiu Wang", title = "Architectural tradeoff in implementing {RSA} processors", journal = j-COMP-ARCH-NEWS, volume = "30", number = "1", pages = "5--11", month = mar, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Uht:2002:DEE, author = "Augustus K. 
Uht", title = "Disjoint {Eager Execution}: what it is \slash what it is not", journal = j-COMP-ARCH-NEWS, volume = "30", number = "1", pages = "12--14", month = mar, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2002:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "30", number = "1", pages = "15--21", month = mar, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hartstein:2002:OPD, author = "A. Hartstein and Thomas R. Puzak", title = "The optimum pipeline depth for a microprocessor", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "7--13", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hrishikesh:2002:OLD, author = "M. S. Hrishikesh and Doug Burger and Norman P. Jouppi and Stephen W. Keckler and Keith I. 
Farkas and Premkishore Shivakumar", title = "The optimal logic depth per pipeline stage is 6 to 8 {FO4} inverter delays", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "14--24", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sprangle:2002:IPP, author = "Eric Sprangle and Doug Carmean", title = "Increasing processor performance by implementing deeper pipelines", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "25--34", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ernst:2002:EDS, author = "Dan Ernst and Todd Austin", title = "Efficient dynamic scheduling through tag elimination", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "37--46", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fields:2002:SMP, author = "Brian Fields and Rastislav Bod{\'\i}k and Mark D. 
Hill", title = "{Slack}: maximizing performance under technological constraints", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "47--58", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lebeck:2002:LFI, author = "Alvin R. Lebeck and Jinson Koppanalil and Tong Li and Jaidev Patwardhan and Eric Rotenberg", title = "A large, fast instruction window for tolerating cache misses", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "59--70", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2002:ISM, author = "Ho-Seop Kim and James E. Smith", title = "An instruction set and microarchitecture for instruction level distributed processing", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "71--81", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vijaykumar:2002:TFR, author = "T. N. 
Vijaykumar and Irith Pomeranz and Karl Cheng", title = "Transient-fault recovery using simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "87--98", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mukherjee:2002:DDE, author = "Shubhendu S. Mukherjee and Michael Kontz and Steven K. Reinhardt", title = "Detailed design and evaluation of redundant multithreading alternatives", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "99--110", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Prvulovic:2002:RCE, author = "Milos Prvulovic and Zheng Zhang and Josep Torrellas", title = "{ReVive}: cost-effective architectural support for rollback recovery in shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "111--122", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sorin:2002:SIA, author = "Daniel J. Sorin and Milo M. K. Martin and Mark D. Hill and David A. 
Wood", title = "{SafetyNet}: improving the availability of shared memory multiprocessors with global checkpoint\slash recovery", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "123--134", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Heo:2002:DFG, author = "Seongmoo Heo and Kenneth Barr and Mark Hampton and Krste Asanovi{\'c}", title = "Dynamic fine-grain leakage reduction using leakage-biased bitlines", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "137--147", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Flautner:2002:DCS, author = "Kriszti{\'a}n Flautner and Nam Sung Kim and Steve Martin and David Blaauw and Trevor Mudge", title = "Drowsy caches: simple techniques for reducing leakage power", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "148--157", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Iyer:2002:PPE, author = "Anoop Iyer and Diana Marculescu", title = "Power and performance evaluation of globally asynchronous locally synchronous processors", journal = j-COMP-ARCH-NEWS, volume = 
"30", number = "2", pages = "158--168", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Solihin:2002:UUL, author = "Yan Solihin and Jaejin Lee and Josep Torrellas", title = "Using a user-level memory thread for correlation prefetching", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "171--182", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lewis:2002:AIM, author = "Jarrod A. Lewis and Bryan Black and Mikko H. Lipasti", title = "Avoiding initialization misses to the heap", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "183--194", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kandiraju:2002:GDT, author = "Gokul B. 
Kandiraju and Anand Sivasubramaniam", title = "Going the distance for {TLB} prefetching: an application-driven study", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "195--206", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hu:2002:TMS, author = "Zhigang Hu and Stefanos Kaxiras and Margaret Martonosi", title = "Timekeeping in the memory system: predicting and optimizing memory behavior", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "209--220", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2002:IOD, author = "Ilhyun Kim and Mikko H. Lipasti", title = "Implementing optimizations at decode time", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "221--232", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dhodapkar:2002:MMC, author = "Ashutosh S. Dhodapkar and James E. 
Smith", title = "Managing multi-configuration hardware via dynamic working set analysis", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "233--244", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Buonadonna:2002:QPI, author = "Philip Buonadonna and David Culler", title = "Queue pair {IP}: a hybrid architecture for system area networks", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "247--256", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhou:2002:EVC, author = "Yuanyuan Zhou and Angelos Bilas and Suresh Jagannathan and Cezary Dubnicki and James F. 
Philbin and Kai Li", title = "Experiences with {VI} communication for database storage", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "257--268", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pajuelo:2002:SDV, author = "Alex Pajuelo and Antonio Gonz{\'a}lez and Mateo Valero", title = "Speculative dynamic vectorization", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "271--280", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Espasa:2002:TVE, author = "Roger Espasa and Federico Ardanaz and Joel Emer and Stephen Felix and Julio Gago and Roger Gramunt and Isaac Hernandez and Toni Juan and Geoff Lowney and Matthew Mattina and Andr{\'e} Seznec", title = "{Tarantula}: a vector extension to the {Alpha} architecture", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "281--292", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Seznec:2002:DTA, author = "Andr{\'e} Seznec and Stephen Felix and Venkata Krishnan and Yiannakis Sazeides", title = "Design tradeoffs for the {Alpha EV8} conditional branch 
predictor", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "295--306", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chappell:2002:DPB, author = "Robert S. Chappell and Francis Tseng and Adi Yoaz and Yale N. Patt", title = "Difficult-path branch prediction using subordinate microthreads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "307--317", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Raasch:2002:SIQ, author = "Steven E. Raasch and Nathan L. Binkert and Steven K. 
Reinhardt", title = "A scalable instruction queue design using dependence chains", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "318--329", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Steele:2002:OHH, author = "Ken Steele and Jason Waterman and Eugene Weinstein", title = "The {Oxygen H21} handheld", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "3--4", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Keen:2002:HSC, author = "Diana Keen and Frederic T. 
Chong", title = "Hardware-software co-design of embedded sensor-actuator networks", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "5--6", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kondo:2002:SCC, author = "Masaaki Kondo and Motonobu Fujita and Hiroshi Nakamura", title = "Software-controlled on-chip memory for high-performance and low-power computing", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "7--8", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sahoo:2002:SHA, author = "Ramendra K. 
Sahoo and Myung Bae and Jose Moreira", title = "Semi-hierarchical approach for reliability, availability, and serviceability of cellular systems", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "9--10", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eberle:2002:MDC, author = "Hans Eberle", title = "Monitoring and diagnosing computer systems by radio communication", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "11--12", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thies:2002:CML, author = "William Thies and Michal Karczmarek and Michael Gordon and David Maze and Jeremy Wong and Henry Hoffmann and Matthew Brown and Saman Amarasinghe", title = "A common machine language for grid-based architectures", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "13--14", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2002:NAM, author = "Frank Wang and Na Helian and Farhi Marir", title = "A novel associative memory architecture for quick matching", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", 
pages = "15--16", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Parker:2002:CUL, author = "Mike Parker", title = "A case for user-level interrupts", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "17--18", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burtscher:2002:IIF, author = "Martin Burtscher", title = "An improved index function for {(D)FCM} predictors", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "19--24", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2002:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "30", number = "3", pages = "25--26", month = jun, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:00 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gomez:2002:ASA, author = "I. G{\'o}mez and L. 
Pi{\~n}uel and M. Prieto and F. Tirado", title = "Analysis of simulation-adapted {SPEC 2000} benchmarks", journal = j-COMP-ARCH-NEWS, volume = "30", number = "4", pages = "4--10", month = sep, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:15 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2002:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "30", number = "4", pages = "11--16", month = sep, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:15 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Estrin:2002:KAS, author = "Deborah Estrin", title = "Keynote address: {Sensor} network research: emerging challenges for architecture, systems, and languages", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "1--4", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rajwar:2002:TLF, author = "Ravi Rajwar and James R. 
Goodman", title = "Transactional lock-free execution of lock-based programs", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "5--17", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Martinez:2002:SSA, author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas", title = "Speculative synchronization: applying thread-level speculation to explicitly parallel applications", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "18--29", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lepak:2002:TSS, author = "Kevin M. Lepak and Mikko H. 
Lipasti", title = "Temporally silent stores", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "30--41", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sherwood:2002:ACL, author = "Timothy Sherwood and Erez Perelman and Greg Hamerly and Brad Calder", title = "Automatically characterizing large scale program behavior", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "45--57", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ogata:2002:BFO, author = "Kazunori Ogata and Hideaki Komatsu and Toshio Nakatani", title = "Bytecode fetch optimization for a {Java} interpreter", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "58--67", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:2002:UIO, author = "Tao Li and Lizy Kurian John and Anand Sivasubramaniam and N. 
Vijaykrishnan and Juan Rubio", title = "Understanding and improving operating system effects in control flow prediction", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "68--80", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Levis:2002:MTV, author = "Philip Levis and David Culler", title = "{Mat{\'e}}: a tiny virtual machine for sensor networks", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "85--95", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Juang:2002:EEC, author = "Philo Juang and Hidekazu Oki and Yong Wang and Margaret Martonosi and Li-Shiuan Peh and Daniel Rubenstein", title = "Energy-efficient computing for wildlife tracking: design tradeoffs and early experiences with {ZebraNet}", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "96--107", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kirovski:2002:ETS, author = "Darko Kirovski and Milenko Drini{\'c} and Miodrag Potkonjak", title = "Enabling trusted software integrity", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages
= "108--120", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zeng:2002:EME, author = "Heng Zeng and Carla S. Ellis and Alvin R. Lebeck and Amin Vahdat", title = "{ECOSystem}: managing energy as a first class operating system resource", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "123--132", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ashok:2002:CMC, author = "Raksit Ashok and Saurabh Chheda and Csaba Andras Moritz", title = "{Cool-Mem}: combining statically speculative memory accessing with selective address translation for energy efficiency", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "133--143", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sasanka:2002:JLG, author = "Ruchira Sasanka and Christopher J. Hughes and Sarita V. 
Adve", title = "Joint local and global hardware adaptations for energy", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "144--155", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2002:DEC, author = "Dongkeun Kim and Donald Yeung", title = "Design and evaluation of compiler algorithms for pre-execution", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "159--170", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhai:2002:COS, author = "Antonia Zhai and Christopher B. Colohan and J. Gregory Steffan and Todd C. Mowry", title = "Compiler optimization of scalar value communication between speculative threads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "171--183", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Oplinger:2002:ESR, author = "Jeffrey Oplinger and Monica S. 
Lam", title = "Enhancing software reliability with speculative threads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "184--196", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Butts:2002:DDI, author = "J. Adam Butts and Guri Sohi", title = "Dynamic dead-instruction detection and elimination", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "199--210", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2002:ANU, author = "Changkyu Kim and Doug Burger and Stephen W. Keckler", title = "An adaptive, non-uniform cache structure for wire-delay dominated on-chip caches", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "211--222", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mukherjee:2002:CSA, author = "Shubhendu S. 
Mukherjee and Federico Silla and Peter Bannon and Joel Emer and Steve Lang and David Webb", title = "A comparative study of arbitration algorithms for the {Alpha 21364} pipelined router", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "223--234", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2002:IWS, author = "Hyong-youb Kim and Vijay S. Pai and Scott Rixner", title = "Increasing {Web} server throughput with network interface data caching", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "239--250", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kohler:2002:PLO, author = "Eddie Kohler and Robert Morris and Benjie Chen", title = "Programming language optimizations for modular router configurations", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "251--263", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sivathanu:2002:ERA, author = "Muthian Sivathanu and Andrea C. Arpaci-Dusseau and Remzi H. 
Arpaci-Dusseau", title = "Evolving {RPC} for active storage", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "264--276", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cooksey:2002:SCD, author = "Robert Cooksey and Stephan Jourdan and Dirk Grunwald", title = "A stateless, content-directed data prefetching mechanism", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "279--290", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gordon:2002:SCC, author = "Michael I. Gordon and William Thies and Michal Karczmarek and Jasper Lin and Ali S. Meli and Andrew A. 
Lamb and Chris Leger and Jeremy Wong and Henry Hoffmann and David Maze and Saman Amarasinghe", title = "A stream compiler for communication-exposed architectures", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "291--303", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Witchel:2002:MMP, author = "Emmett Witchel and Josh Cates and Krste Asanovi{\'c}", title = "{Mondrian} memory protection", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "304--316", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dennis:2003:FBM, author = "Jack B. Dennis", title = "Fresh {Breeze}: a multiprocessor chip architecture guided by modular programming principles", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "7--15", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Morano:2003:RHI, author = "D. Morano and A. Khalafi and D. R. Kaeli and A. K. 
Uht", title = "Realizing high {IPC} through a scalable memory-latency tolerant multipath microarchitecture", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "16--25", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Almasi:2003:DCD, author = "George Alm{\'a}si and C{\u{a}}lin Ca{\c{s}}caval and Jos{\'e} G. Casta{\~n}os and Monty Denneau and Derek Lieber and Jos{\'e} E. Moreira and Henry S. {Warren, Jr.}", title = "Dissecting {Cyclops}: a detailed analysis of a multithreaded architecture", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "26--38", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zahran:2003:CMH, author = "Mohamed M. 
Zahran", title = "On cache memory hierarchy for {Chip-Multiprocessor}", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "39--48", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Grewal:2003:EAC, author = "Gary Gr{\'e}wal and Tom Wilson and Andrew Morton", title = "An {EGA} approach to the compile-time assignment of data to multiple memories in digital-signal processors", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "49--59", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ramacher:2003:GVP, author = "Ulrich Ramacher and Nico Br{\"u}ls and Ulrich Hachmann and Jens Harnisch and Wolfgang Raab and Axel Techmer", title = "{100 GOPS} vision processor for automotive applications", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "60--68", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pitsianis:2003:IVM, author = "Nikos P. Pitsianis and Gerald G.
Pechanek", title = "Indirect {VLIW} memory allocation for the {ManArray} multiprocessor {DSP}", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "69--74", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shimizu:2003:TLS, author = "Naohiko Shimizu and Ken Takatori", title = "A transparent {Linux} super page kernel for {Alpha}, {Sparc64} and {IA32}: reducing {TLB} misses of applications", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "75--84", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bechini:2003:FGD, author = "Alessio Bechini and Pierfrancesco Foglia and Cosimo Antonio Prete", title = "Fine-grain design space exploration for a cartographic {SoC} multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "85--92", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2003:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "93--96", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 
(IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Skadron:2003:TAM, author = "Kevin Skadron and Mircea R. Stan and Wei Huang and Sivakumar Velusamy and Karthik Sankaranarayanan and David Tarjan", title = "Temperature-aware microarchitecture", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "2--13", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Magklis:2003:PBD, author = "Grigorios Magklis and Michael L. Scott and Greg Semeraro and David H. Albonesi and Steven Dropsho", title = "Profile-based dynamic voltage and frequency scaling for a multiple clock domain microprocessor", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "14--27", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2003:HPA, author = "Ilhyun Kim and Mikko H. 
Lipasti", title = "Half-price architecture", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "28--38", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Park:2003:IMP, author = "Il Park and Babak Falsafi and T. N. Vijaykumar", title = "Implicitly-multithreaded processors", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "39--51", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Citron:2003:MPM, author = "Daniel Citron", title = "{MisSPECulation}: partial and misleading use of {SPEC CPU2000} in computer architecture conferences", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "52--61", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tseng:2003:BMR, author = "Jessica H. 
Tseng and Krste Asanovi{\'c}", title = "Banked multiported register files for high-frequency superscalar microprocessors", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "62--71", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Powell:2003:PDM, author = "Michael D. Powell and T. N. Vijaykumar", title = "Pipeline damping: a microarchitectural technique to reduce inductive noise in supply voltage", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "72--83", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wunderlich:2003:SAM, author = "Roland E. Wunderlich and Thomas F. Wenisch and Babak Falsafi and James C. Hoe", title = "{SMARTS}: accelerating microarchitecture simulation via rigorous statistical sampling", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "84--97", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gomaa:2003:TFR, author = "Mohamed Gomaa and Chad Scarbrough and T. N. 
Vijaykumar and Irith Pomeranz", title = "Transient-fault recovery for chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "98--109", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Prvulovic:2003:RUT, author = "Milos Prvulovic and Josep Torrellas", title = "{ReEnact}: using thread-level speculation mechanisms to debug data races in multithreaded codes", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "110--121", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Xu:2003:FDR, author = "Min Xu and Rastislav Bodik and Mark D. 
Hill", title = "A ``flight data recorder'' for enabling full-system multiprocessor deterministic replay", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "122--135", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:2003:HCC, author = "Chuanjun Zhang and Frank Vahid and Walid Najjar", title = "A highly configurable cache architecture for embedded systems", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "136--146", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Buyuktosunoglu:2003:EEC, author = "Alper Buyuktosuno{\u{g}}lu and Tejas Karkhanis and David H. Albonesi and Pradip Bose", title = "Energy efficient co-adaptive instruction fetch and issue", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "147--156", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huang:2003:PAP, author = "Michael C. 
Huang and Jose Renau and Josep Torrellas", title = "Positional adaptation of processors: application to energy reduction", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "157--168", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gurumurthi:2003:DDS, author = "Sudhanva Gurumurthi and Anand Sivasubramaniam and Mahmut Kandemir and Hubertus Franke", title = "{DRPM}: dynamic speed control for power management in server class disks", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "169--181", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Martin:2003:TCD, author = "Milo M. K. Martin and Mark D. Hill and David A. Wood", title = "Token coherence: decoupling performance and correctness", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "182--193", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Singh:2003:GLB, author = "Arjun Singh and William J. Dally and Amit K. 
Gupta and Brian Towles", title = "{GOAL}: a load-balanced adaptive routing algorithm for torus networks", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "194--205", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Martin:2003:UDS, author = "Milo M. K. Martin and Pacia J. Harper and Daniel J. Sorin and Mark D. Hill and David A. Wood", title = "Using destination-set prediction to improve the latency\slash bandwidth tradeoff in shared-memory multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "206--217", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cvetanovic:2003:PAA, author = "Zarka Cvetanovic", title = "Performance analysis of the {Alpha 21364}-based {HP GS1280} multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "218--229", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Oberoi:2003:PFE, author = "Paramjit S. Oberoi and Gurindar S. 
Sohi",
  title =        "Parallelism in the front-end",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "230--240",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:2003:EAP,
  author =       "Andr{\'e} Seznec and Antony Fraboulet",
  title =        "Effective ahead pipelining of instruction block address generation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "241--252",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ernst:2003:CBF,
  author =       "Dan Ernst and Andrew Hamel and Todd Austin",
  title =        "{Cyclone}: a broadcast-free dynamic instruction scheduler with selective replay",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "253--263",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhargava:2003:IDC,
  author =       "Ravi Bhargava and Lizy K.
John",
  title =        "Improving dynamic cluster assignment for clustered trace cache processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "264--274",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Balasubramonian:2003:DMC,
  author =       "Rajeev Balasubramonian and Sandhya Dwarkadas and David H. Albonesi",
  title =        "Dynamically managing the communication-parallelism trade-off in future clustered processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "275--287",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sherwood:2003:PMA,
  author =       "Timothy Sherwood and George Varghese and Brad Calder",
  title =        "A pipelined memory architecture for high throughput network processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "288--299",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hasan:2003:EUM,
  author =       "Jahangir Hasan and Satish Chandra and T. N.
Vijaykumar",
  title =        "Efficient use of memory bandwidth to improve network processor throughput",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "300--313",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomas:2003:IBP,
  author =       "Renju Thomas and Manoj Franklin and Chris Wilkerson and Jared Stark",
  title =        "Improving branch prediction by dynamic dataflow-based identification of correlated branches from a large global history",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "314--323",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zhou:2003:DGS,
  author =       "Huiyang Zhou and Jill Flanagan and Thomas M.
Conte",
  title =        "Detecting global stride locality in value streams",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "324--335",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sherwood:2003:PTP,
  author =       "Timothy Sherwood and Suleyman Sair and Brad Calder",
  title =        "Phase tracking and prediction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "336--349",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anantaraman:2003:VSA,
  author =       "Aravindh Anantaraman and Kiran Seth and Kaustubh Patil and Eric Rotenberg and Frank Mueller",
  title =        "Virtual simple architecture {(VISA)}: exceeding the complexity limit in safe real-time systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "350--361",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Corliss:2003:DPM,
  author =       "Marc L. Corliss and E.
Christopher Lewis and Amir Roth",
  title =        "{DISE}: a programmable macro engine for customizing applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "362--373",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Oskin:2003:BQW,
  author =       "Mark Oskin and Frederic T. Chong and Isaac L. Chuang and John Kubiatowicz",
  title =        "Building quantum wires: the long and the short of it",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "374--387",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:2003:GRP,
  author =       "Zhenlin Wang and Doug Burger and Kathryn S. McKinley and Steven K. Reinhardt and Charles C.
Weems",
  title =        "Guided region prefetching: a cooperative hardware\slash software approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "388--398",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kozyrakis:2003:OLC,
  author =       "Christos Kozyrakis and David Patterson",
  title =        "Overcoming the limitations of conventional vector processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "399--409",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Suh:2003:PAP,
  author =       "Jinwoo Suh and Eun-Gyu Kim and Stephen P. Crago and Lakshmi Srinivasan and Matthew C. French",
  title =        "A performance analysis of {PIM}, stream processing, and tiled processing on memory-intensive signal processing kernels",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "410--421",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sankaralingam:2003:EIT,
  author =       "Karthikeyan Sankaralingam and Ramadass Nagarajan and Haiming Liu and Changkyu Kim and Jaehyuk Huh and Doug Burger and Stephen W. Keckler and Charles R.
Moore",
  title =        "Exploiting {ILP}, {TLP}, and {DLP} with the polymorphous {TRIPS} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "422--433",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:2003:JSD,
  author =       "Michael K. Chen and Kunle Olukotun",
  title =        "The {Jrpm} system for dynamically parallelizing {Java} programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "434--446",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fong:2003:CAA,
  author =       "Anthony S. Fong",
  title =        "A computer architecture with access control and cache option tags on individual instruction operands",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "3",
  pages =        "1--5",
  month =        jun,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:00 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tan:2003:DAP,
  author =       "Edwin J. Tan and Wendi B.
Heinzelman",
  title =        "{DSP} architectures: past, present and futures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "3",
  pages =        "6--19",
  month =        jun,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:00 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vintan:2003:ABP,
  author =       "Lucian N. Vintan and Marius Sbera and Ioan Z. Mihu and Adrian Florea",
  title =        "An alternative to branch prediction: pre-computed branches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "3",
  pages =        "20--29",
  month =        jun,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:00 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Heinrich:2003:OWA,
  author =       "Mark Heinrich and Mainak Chaudhuri",
  title =        "Ocean warning: avoid drowning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "3",
  pages =        "30--32",
  month =        jun,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:00 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lafitte:2003:QMC,
  author =       "Jean-Louis Lafitte",
  title =        "Qualitatively matching computer architecture with {Turing} machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "3",
  pages =        "33--41",
  month =        jun,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:00 MDT
2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Koushiro:2003:TLV,
  author =       "Takenori Koushiro and Toshinori Sato and Itsujiro Arita",
  title =        "A trace-level value predictor for {Contrail} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "3",
  pages =        "42--47",
  month =        jun,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:00 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2003:INb,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "3",
  pages =        "48--54",
  month =        jun,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:00 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorup:2003:CPM,
  author =       "Mikkel Thorup",
  title =        "Combinatorial power in multimedia processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "4",
  pages =        "5--11",
  month =        sep,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:15 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hau:2003:SJA,
  author =       "Gary K. W.
Hau and Anthony Fong and Mok Pak Lun",
  title =        "Support of {Java API} for the {jHISC} system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "4",
  pages =        "12--17",
  month =        sep,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:15 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lun:2003:MMO,
  author =       "Mok Pak Lun and Richard Li and Anthony Fong",
  title =        "Method manipulation in an object-oriented processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "4",
  pages =        "18--25",
  month =        sep,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:15 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2003:INc,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "4",
  pages =        "26--32",
  month =        sep,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:15 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Breen:2003:AAA,
  author =       "Kristopher C. Breen and Duncan G.
Elliott",
  title =        "Aliasing and anti-aliasing in branch history table prediction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "5",
  pages =        "1--4",
  month =        dec,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yu:2003:TBS,
  author =       "Ryan W. S. Yu and Gary K. W. Hau and Anthony S. Fong",
  title =        "Test bench for software development of object-oriented processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "5",
  pages =        "5--9",
  month =        dec,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lun:2003:OOP,
  author =       "Mok Pak Lun and Anthony Fong and Gary K. W.
Hau",
  title =        "Object-oriented processor requirements with instruction analysis of {Java} programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "5",
  pages =        "10--15",
  month =        dec,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2003:INd,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "5",
  pages =        "16--21",
  month =        dec,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{John:2004:MFS,
  author =       "Lizy Kurian John",
  title =        "More on finding a single number to indicate overall performance of a benchmark suite",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "1",
  pages =        "3--8",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2004:INa,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "1",
  pages =        "9--13",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement =
ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Taylor:2004:ERM,
  author =       "Michael Bedford Taylor and Walter Lee and Jason Miller and David Wentzlaff and Ian Bratt and Ben Greenwald and Henry Hoffmann and Paul Johnson and Jason Kim and James Psota and Arvind Saraf and Nathan Shnidman and Volker Strumpen and Matt Frank and Saman Amarasinghe and Anant Agarwal",
  title =        "Evaluation of the {Raw} Microprocessor: An Exposed-Wire-Delay Architecture for {ILP} and Streams",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "2--2",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2004:GCC,
  author =       "Anonymous",
  title =        "General Co-Chair's Message",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "9--9",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2004:PCM,
  author =       "Anonymous",
  title =        "Program Chair's Message",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "10--10",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}
@Article{Anonymous:2004:C,
  author =       "Anonymous",
  title =        "Committees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "11--11",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2004:Ra,
  author =       "Anonymous",
  title =        "Reviewers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "13--13",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ahn:2004:EIS,
  author =       "Jung Ho Ahn and William J. Dally and Brucek Khailany and Ujval J. Kapasi and Abhishek Das",
  title =        "Evaluating the {Imagine} Stream Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "14--14",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sias:2004:FTI,
  author =       "John W. Sias and Sain-zee Ueng and Geoff A. Kent and Ian M. Steiner and Erik M. Nystrom and Wen-mei W.
Hwu",
  title =        "Field-testing {IMPACT EPIC} research results in {Itanium 2}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "26--26",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vijaykumar:2004:WDP,
  author =       "T. N. Vijaykumar and Zeshan Chishti",
  title =        "Wire Delay is Not a Problem for {SMT} (In the Near Future)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "40--40",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Krashinsky:2004:VTA,
  author =       "Ronny Krashinsky and Christopher Batten and Mark Hampton and Steve Gerding and Brian Pharris and Jared Casper and Krste Asanovi{\'c}",
  title =        "The Vector-Thread Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "52--52",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:2004:SIH,
  author =       "Rakesh Kumar and Dean M. Tullsen and Parthasarathy Ranganathan and Norman P. Jouppi and Keith I.
Farkas",
  title =        "Single-{ISA} Heterogeneous Multi-Core Architectures for Multithreaded Workload Performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "64--64",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chou:2004:MOE,
  author =       "Yuan Chou and Brian Fahs and Santosh Abraham",
  title =        "Microarchitecture Optimizations for Exploiting Memory-Level Parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "76--76",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cain:2004:MOV,
  author =       "Harold W. Cain and Mikko H. Lipasti",
  title =        "Memory Ordering: a Value-Based Approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "90--90",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hammond:2004:TMC,
  author =       "Lance Hammond and Vicky Wong and Mike Chen and Brian D. Carlstrom and John D. Davis and Ben Hertzberg and Manohar K.
Prabhu and Honggo Wijaya and Christos Kozyrakis and Kunle Olukotun",
  title =        "Transactional Memory Coherence and Consistency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "102--102",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hangal:2004:TPV,
  author =       "Sudheendra Hangal and Durgam Vahia and Chaiyasit Manovit and Juin-Yeu Joseph Lu",
  title =        "{TSOtool}: a Program for Verifying Memory Systems Using the Memory Consistency Model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "114--114",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chaudhuri:2004:SAN,
  author =       "Mainak Chaudhuri and Mark Heinrich",
  title =        "{SMTp}: An Architecture for Next-generation Scalable Multi-threading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "124--124",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hughes:2004:FAF,
  author =       "Christopher J. Hughes and Sarita V.
Adve",
  title =        "A Formal Approach to Frequent Energy Adaptations for Multimedia Applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "138--138",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Oliver:2004:SMC,
  author =       "John Oliver and Ravishankar Rao and Paul Sultana and Jedidiah Crandall and Erik Czernikowski and Leslie W. {Jones IV} and Diana Franklin and Venkatesh Akella and Frederic T. Chong",
  title =        "{Synchroscalar}: a Multiple Clock Domain, Power-Aware, Tile-Based Embedded Processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "150--150",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rosner:2004:PAT,
  author =       "Roni Rosner and Yoav Almog and Micha Moffie and Naftali Schwartz and Avi Mendelson",
  title =        "Power Awareness through Selective Dynamically Optimized Traces",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "162--162",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bairavasundaram:2004:XRN,
  author =       "Lakshmi N. Bairavasundaram and Muthian Sivathanu and Andrea C.
Arpaci-Dusseau and Remzi H. Arpaci-Dusseau",
  title =        "{X-RAY}: a Non-Invasive Exclusive Caching Mechanism for {RAIDs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "176--176",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mullins:2004:LLV,
  author =       "Robert Mullins and Andrew West and Simon Moore",
  title =        "Low-Latency Virtual-Channel Routers for On-Chip Networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "188--188",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Puente:2004:ICR,
  author =       "V. Puente and J. A. Gregorio and F. Vallejo and R. Beivide",
  title =        "{Immunet}: a Cheap and Robust Fault-Tolerant Packet Routing Mechanism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "198--198",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Alameldeen:2004:ACC,
  author =       "Alaa R. Alameldeen and David A.
Wood", title = "Adaptive Cache Compression for High-Performance Processors", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "212--212", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhou:2004:IEA, author = "Pin Zhou and Feng Qin and Wei Liu and Yuanyuan Zhou and Josep Torrellas", title = "{iWatcher}: Efficient Architectural Support for Software Debugging", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "224--224", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yehia:2004:SDI, author = "Sami Yehia and Olivier Temam", title = "From Sequences of Dependent Instructions to Functions: An Approach for Improving Performance without {ILP} or Speculation", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "238--238", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Falcon:2004:PCH, author = "Ayose Falcon and Jared Stark and Alex Ramirez and Konrad Lai and Mateo Valero", title = "Prophet\slash Critic Hybrid Branch Prediction", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "250--250", month 
= mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Weaver:2004:TRS, author = "Christopher Weaver and Joel Emer and Shubhendu S. Mukherjee and Steven K. Reinhardt", title = "Techniques to Reduce the Soft Error Rate of a High-Performance Microprocessor", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "264--264", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Srinivasan:2004:CLR, author = "Jayanth Srinivasan and Sarita V. Adve and Pradip Bose and Jude A. Rivers", title = "The Case for Lifetime Reliability-Aware Microprocessors", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "276--276", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Powell:2004:ERB, author = "Michael D. Powell and T. N. 
Vijaykumar",
  title =        "Exploiting Resonant Behavior to Reduce Inductive Noise",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "288--288",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Butts:2004:UBR,
  author =       "J. Adam Butts and Gurindar S. Sohi",
  title =        "Use-Based Register Caching with Decoupled Indexing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "302--302",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): the first author's given name previously duplicated
%%% the surname ("Gonzalez Gonzalez").  It has been restored to
%%% "Ruben"; confirm against the publisher's record for this paper.
%%% The citation label is unchanged.
@Article{Gonzalez:2004:CAI,
  author =       "Ruben Gonzalez and Adrian Cristal and Daniel Ortega and
                 Alexander Veidenbaum and Mateo Valero",
  title =        "A Content Aware Integer Register File Organization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "314--314",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipasti:2004:PRI,
  author =       "Mikko H. Lipasti and Brian R.
Mestan and Erika Gunadi",
  title =        "Physical Register Inlining",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "325--325",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Karkhanis:2004:FOS,
  author =       "Tejas S. Karkhanis and James E. Smith",
  title =        "A First-Order Superscalar Processor Model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "338--338",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Eeckhout:2004:CFM,
  author =       "Lieven Eeckhout and Robert H. {Bell Jr.} and Bastiaan
                 Stougie and Koen {De Bosschere} and Lizy K.
John", title = "Control Flow Modeling in Statistical Simulation for Accurate and Efficient Processor Design Studies", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "350--350", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Iyer:2004:ESI, author = "Bharath Iyer and Sadagopan Srinivasan and Bruce Jacob", title = "Extended Split-Issue: Enabling Flexibility in the Hardware Implementation of {NUAL VLIW DSPs}", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "364--364", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Parashar:2004:CEA, author = "Angshuman Parashar and Sudhanva Gurumurthi and Anand Sivasubramaniam", title = "A Complexity-Effective Approach to {ALU} Bandwidth Enhancement for Instruction-Level Temporal Redundancy", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "376--376", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2004:AI, author = "Anonymous", title = "Author Index", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "387--387", month = mar, year = "2004", 
CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cristal:2004:CRC,
  author =       "Adri{\'a}n Cristal and Jos{\'e} F. Mart{\'\i}nez and
                 Josep Llosa and Mateo Valero",
  title =        "A case for resource-conscious out-of-order processors:
                 towards kilo-instruction in-flight processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "3",
  pages =        "3--10",
  month =        jun,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kundu:2004:CSI,
  author =       "Partha Kundu and Murali Annavaram and Trung Diep and
                 John Shen",
  title =        "A case for shared instruction cache on chip
                 multiprocessors running {OLTP}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "3",
  pages =        "11--18",
  month =        jun,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% The institutional name in the author list of the next entry carries
%%% an extra brace level so that BibTeX keeps it as one indivisible
%%% name instead of splitting it into given-name and surname parts.
%%% NOTE(review): it is presumably an affiliation that was recorded as
%%% an author; confirm against the publisher's record before removing.
@Article{Venkateswaran:2004:MPN,
  author =       "N. Venkateswaran and {Waran Research Foundation} and
                 Aditya Krishnan and S.
Niranjan Kumar and Arrvindh Shriraman and Srinivas Sridharan", title = "Memory in processor: a novel design paradigm for supercomputing architectures", journal = j-COMP-ARCH-NEWS, volume = "32", number = "3", pages = "19--26", month = jun, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Branovic:2004:WCE, author = "I. Branovic and R. Giorgi and E. Martinelli", title = "A workload characterization of elliptic curve cryptography methods in embedded environments", journal = j-COMP-ARCH-NEWS, volume = "32", number = "3", pages = "27--34", month = jun, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brifault:2004:DCM, author = "K. Brifault and H. P. 
Charles", title = "Data cache management on {EPIC} architecture: optimizing memory access for image processing", journal = j-COMP-ARCH-NEWS, volume = "32", number = "3", pages = "35--42", month = jun, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shimizu:2004:JOL, author = "Naohiko Shimizu and Chiaki Kon", title = "{Java} object look aside buffer for embedded applications", journal = j-COMP-ARCH-NEWS, volume = "32", number = "3", pages = "43--49", month = jun, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sakanaka:2004:LER, author = "Akihito Sakanaka and Seiichirou Fujii and Toshinori Sato", title = "A leakage-energy-reduction technique for highly-associative caches in embedded systems", journal = j-COMP-ARCH-NEWS, volume = "32", number = "3", pages = "50--54", month = jun, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Moch:2004:HSM, author = "S. Moch and M. Berekovi{\'c} and H. J. Stolberg and L. Friebe and M. B. Kulaczewski and A. Dehnhardt and P. 
Pirsch",
  title =        "{HIBRID-SOC}: a multi-core architecture for image and
                 video applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "3",
  pages =        "55--61",
  month =        jun,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% The first author's surname is written with the accent, matching the
%%% spelling "Berekovi{\'c}" used for the same co-author in the
%%% preceding entry of this issue.  The citation label stays
%%% unaccented.
@Article{Berekovic:2004:SCS,
  author =       "Mladen Berekovi{\'c} and S{\"o}ren Moch and Peter
                 Pirsch",
  title =        "A scalable, clustered {SMT} processor for digital
                 signal processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "3",
  pages =        "62--69",
  month =        jun,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bartolini:2004:PIS,
  author =       "S. Bartolini and C. A.
Prete", title = "A proposal for input-sensitivity analysis of profile-driven optimizations on embedded applications", journal = j-COMP-ARCH-NEWS, volume = "32", number = "3", pages = "70--77", month = jun, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2004:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "32", number = "3", pages = "78--83", month = jun, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mashey:2004:WBM, author = "John R. 
Mashey", title = "War of the benchmark means: time for a truce", journal = j-COMP-ARCH-NEWS, volume = "32", number = "4", pages = "1--14", month = sep, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:16 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lafitte:2004:YLL, author = "Jean-Louis Lafitte", title = "40 years later \ldots{} a new engine to handle an operating system infrastructure", journal = j-COMP-ARCH-NEWS, volume = "32", number = "4", pages = "15--22", month = sep, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:16 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2004:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "32", number = "4", pages = "23--41", month = sep, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:16 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hammond:2004:PTC, author = "Lance Hammond and Brian D. 
Carlstrom and Vicky Wong and Ben Hertzberg and Mike Chen and Christos Kozyrakis and Kunle Olukotun", title = "Programming with transactional coherence and consistency {(TCC)}", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "1--13", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Budiu:2004:SC, author = "Mihai Budiu and Girish Venkataramani and Tiberiu Chelcea and Seth Copen Goldstein", title = "Spatial computation", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "14--26", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ekanayake:2004:ULP, author = "Virantha Ekanayake and Clinton {Kelly IV} and Rajit Manohar", title = "An ultra low-power processor for sensor networks", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "27--36", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lumb:2004:DSD, author = "Christopher R. 
Lumb and Richard Golding", title = "{D-SPTF}: decentralized request distribution in brick-based storage systems", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "37--47", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Saito:2004:FBD, author = "Yasushi Saito and Svend Fr{\o}lund and Alistair Veitch and Arif Merchant and Susan Spence", title = "{FAB}: building distributed enterprise disk arrays from commodity components", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "48--58", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Denehy:2004:DSA, author = "Timothy E. Denehy and John Bent and Florentina I. Popovici and Andrea C. Arpaci-Dusseau and Remzi H. 
Arpaci-Dusseau", title = "Deconstructing storage arrays", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "59--71", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhuang:2004:HIE, author = "Xiaotong Zhuang and Tao Zhang and Santosh Pande", title = "{HIDE}: an infrastructure for efficiently protecting information leakage on the address bus", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "72--84", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Suh:2004:SPE, author = "G. Edward Suh and Jae W. Lee and David Zhang and Srinivas Devadas", title = "Secure program execution via dynamic information flow tracking", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "85--96", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huh:2004:CDM, author = "Jaehyuk Huh and Jichuan Chang and Doug Burger and Gurindar S. 
Sohi", title = "Coherence decoupling: making use of incoherence", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "97--106", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Srinivasan:2004:CFP, author = "Srikanth T. Srinivasan and Ravi Rajwar and Haitham Akkary and Amit Gandhi and Mike Upton", title = "Continual flow pipelines", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "107--119", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Desikan:2004:SSR, author = "Rajagopalan Desikan and Simha Sethumadhavan and Doug Burger and Stephen W. 
Keckler", title = "Scalable selective re-execution for {EDGE} architectures", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "120--132", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Regehr:2004:HSA, author = "John Regehr and Alastair Reid", title = "{HOIST}: a system for automatically deriving static analyzers for embedded systems", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "133--143", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2004:HTV, author = "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. Shen", title = "Helper threads via virtual multithreading on an experimental {Itanium-2} processor-based platform", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "144--155", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hauswirth:2004:LOM, author = "Matthias Hauswirth and Trishul M. 
Chilimbi", title = "Low-overhead memory leak detection using adaptive statistical profiling", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "156--164", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shen:2004:LPP, author = "Xipeng Shen and Yutao Zhong and Chen Ding", title = "Locality phase prediction", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "165--176", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhou:2004:DTP, author = "Pin Zhou and Vivek Pandey and Jagadeesan Sundaresan and Anand Raghuraman and Yuanyuan Zhou and Sanjeev Kumar", title = "Dynamic tracking of page miss ratio curve for memory management", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "177--188", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rabbah:2004:COP, author = "Rodric M. 
Rabbah and Hariharan Sandanagobalane and Mongkol Ekpanyapong and Weng-Fai Wong", title = "Compiler orchestrated prefetching via speculation and predication", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "189--198", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cher:2004:SPM, author = "Chen-Yong Cher and Antony L. Hosking and T. N. Vijaykumar", title = "Software prefetching for mark-sweep garbage collection: hardware analysis and software redesign", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "199--210", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lowell:2004:DVM, author = "David E. Lowell and Yasushi Saito and Eileen J. Samberg", title = "Devirtualizable virtual machines enabling general, single-node, online maintenance", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "211--223", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Smolens:2004:FBS, author = "Jared C. Smolens and Brian T. Gold and Jangwoo Kim and Babak Falsafi and James C. Hoe and Andreas G. 
Nowatzyk", title = "Fingerprinting: bounding soft-error detection latency and bandwidth", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "224--234", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bronevetsky:2004:ALC, author = "Greg Bronevetsky and Daniel Marques and Keshav Pingali and Peter Szwed and Martin Schulz", title = "Application-level checkpointing for shared memory programs", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "235--247", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wu:2004:FOM, author = "Qiang Wu and Philo Juang and Margaret Martonosi and Douglas W. Clark", title = "Formal online methods for voltage\slash frequency control in multiple clock domain microprocessors", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "248--259", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gomaa:2004:HRL, author = "Mohamed Gomaa and Michael D. Powell and T. N. 
Vijaykumar", title = "Heat-and-run: leveraging {SMT} and {CMP} to manage power density through the operating system", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "260--270", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:2004:PDE, author = "Xiaodong Li and Zhenmin Li and Francis David and Pin Zhou and Yuanyuan Zhou and Sarita Adve and Sanjeev Kumar", title = "Performance directed energy management for main memory and disks", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "271--283", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chess:2005:SAC, author = "David M. Chess", title = "Security in autonomic computing", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "2--5", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Shi:2005:TIA, author = "Weidong Shi and Hsien-Hsin S. 
Lee and Chenghuai Lu and Mrinmoy Ghosh",
  title =        "Towards the issues in architectural support for
                 protection of software execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "6--15",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}

@Article{McGregor:2005:PCK,
  author =       "John P. McGregor and Ruby B. Lee",
  title =        "Protecting cryptographic keys and computations via
                 virtual secure coprocessing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "16--26",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}

@Article{Rogers:2005:MPH,
  author =       "Brian Rogers and Yan Solihin and Milos Prvulovic",
  title =        "Memory predecryption: hiding the latency overhead of
                 memory encryption",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "27--33",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}
@Article{Holland:2005:ADK, author = "David A. Holland and Ada T. Lim and Margo I. Seltzer", title = "An architecture a day keeps the hacker away", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "34--41", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Sidiroglou:2005:HSS, author = "Stelios Sidiroglou and Michael E. Locasto and Angelos D. Keromytis", title = "Hardware support for self-healing software services", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "42--47", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Crandall:2005:SAM, author = "Jedidiah R. Crandall and Frederic T. 
Chong", title = "A security assessment of the {Minos} architecture", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "48--57", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Burnside:2005:CCP, author = "Matthew Burnside and Angelos D. Keromytis", title = "The case for crypto protocol awareness inside the {OS} kernel", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "58--64", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Corliss:2005:UDP, author = "Marc L. Corliss and E. 
Christopher Lewis and Amir Roth", title = "Using {DISE} to protect return addresses from attack", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "65--72", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Ye:2005:RRA, author = "Dong Ye and David Kaeli", title = "A reliable return address stack: microarchitectural features to defeat stack smashing", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "73--80", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Inoue:2005:EST, author = "Koji Inoue", title = "Energy-security tradeoff in a secure cache architecture against buffer overflow attacks", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "81--89", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Uluski:2005:CAW, author = "Derek Uluski and Micha Moffie and David 
Kaeli", title = "Characterizing antivirus workload execution", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "90--98", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Aldwairi:2005:CSM, author = "Monther Aldwairi and Thomas Conte and Paul Franzon", title = "Configurable string matching hardware for speeding up intrusion detection", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "99--107", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Milenkovic:2005:UIB, author = "Milena Milenkovi{\'c} and Aleksandar Milenkovi{\'c} and Emil Jovanov", title = "Using instruction block signatures to counter code injection attacks", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "108--117", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Zhang:2005:ASP, author = "Youtao 
Zhang and Jun Yang and Yongjing Lin and Lan Gao", title = "Architectural support for protecting user privacy on trusted processors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "118--123", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Shirase:2005:AEC, author = "Masaaki Shirase and Yasushi Hibino", title = "An architecture for elliptic curve cryptography computation", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "124--133", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Kgil:2005:CSS, author = "Taeho Kgil and Laura Falk and Trevor Mudge", title = "{ChipLock}: support for secure microarchitectures", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "134--143", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Workshop on Architectural Support for Security and Anti-Virus (WASSA)", } @Article{Ekman:2005:DLC, author = "Magnus Ekman and 
Fredrik Warg and Jim Nilsson", title = "An in-depth look at computer performance growth", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "144--147", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Venkateswaran:2005:FTB, author = "N. Venkateswaran and S. Balaji and V. Sridhar", title = "Fault tolerant bus architecture for deep submicron based processors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "148--155", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2005:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "33", number = "1", pages = "156--160", month = mar, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:2005:APC, author = "Ruby B. Lee and Peter C. S. Kwan and John P. 
McGregor and Jeffrey Dwoskin and Zhenghong Wang",
  title =        "Architecture for Protecting Critical Secrets in Microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "2--13",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Front-matter item.  The page was previously recorded as arabic
%%% 9--9, which collides with the paper on pages 2--13.  The following
%%% front-matter item (Program Chair's Message) is paged x--xv, so this
%%% one is roman page ix.
@Article{Anonymous:2005:GCM,
  author =       "Anonymous",
  title =        "{General Chair}'s Message",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "ix--ix",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2005:PCM,
  author =       "Anonymous",
  title =        "Program {Chair}'s Message",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "x--xv",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shi:2005:HEC,
  author =       "Weidong Shi and Hsien-Hsin S.
Lee and Mrinmoy Ghosh and Chenghuai Lu and Alexandra Boldyreva",
  title =        "High Efficiency Counter Mode Security Architecture via Prediction and Precomputation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "14--24",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Front-matter item.  The page was previously recorded as arabic
%%% 16--16, which collides with the paper on pages 14--24.  The
%%% following front-matter item (Reviewers) is paged xvii--xviii, so
%%% this one is roman page xvi.
@Article{Anonymous:2005:C,
  author =       "Anonymous",
  title =        "Committees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "xvi--xvi",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2005:R,
  author =       "Anonymous",
  title =        "Reviewers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "xvii--xviii",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Suh:2005:DIA,
  author =       "G. Edward Suh and Charles W.
O'Donnell and Ishan Sachdev and Srinivas Devadas", title = "Design and Implementation of the {AEGIS} Single-Chip Secure Processor Using Physical Random Functions", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "25--36", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gurumurthi:2005:DDR, author = "Sudhanva Gurumurthi and Anand Sivasubramaniam and Vivek K. Natarajan", title = "Disk Drive Roadmap from the Thermal Perspective: a Case for Dynamic Thermal Management", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "38--49", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huggahalli:2005:DCA, author = "Ram Huggahalli and Ravi Iyer and Scott Tetrick", title = "Direct Cache Access for High Bandwidth Network {I/O}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "50--59", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gunawi:2005:DCS, author = "Haryadi S. Gunawi and Nitin Agrawal and Andrea C. Arpaci-Dusseau and Remzi H. 
Arpaci-Dusseau and Jiri Schindler", title = "Deconstructing Commodity Storage Clusters", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "60--71", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ekman:2005:RMM, author = "Magnus Ekman and Per Stenstr{\"o}m", title = "A Robust Main-Memory Compression Scheme", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "74--85", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fahs:2005:CO, author = "Brian Fahs and Todd Rafacz and Sanjay J. Patel and Steven S. 
Lumetta", title = "Continuous Optimization", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "86--97", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Petric:2005:RRB, author = "Vlad Petric and Tingting Sha and Amir Roth", title = "{RENO}: a Rename-Based Instruction Optimizer", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "98--109", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tan:2005:HTS, author = "Lin Tan and Timothy Sherwood", title = "A High Throughput String Matching Architecture for Intrusion Detection and Prevention", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "112--122", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Baboescu:2005:TBR, author = "Florin Baboescu and Dean M. 
Tullsen and Grigore Rosu and Sumeet Singh", title = "A Tree Based Router Search Engine Architecture with Single Port Memories", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "123--133", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kyo:2005:IMA, author = "Shorin Kyo and Shin'ichiro Okazaki and Tamio Arai", title = "An Integrated Memory Array Processor Architecture for Embedded Image Recognition Systems", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "134--145", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reis:2005:DEH, author = "George A. Reis and Jonathan Chang and Neil Vachharajani and Ram Rangan and David I. August and Shubhendu S. Mukherjee", title = "Design and Evaluation of Hybrid Fault-Detection Systems", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "148--159", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schuchman:2005:RMT, author = "Ethan Schuchman and T. N. 
Vijaykumar", title = "{Rescue}: a Microarchitecture for Testability and Defect Tolerance", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "160--171", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gomaa:2005:OTF, author = "Mohamed A. Gomaa and T. N. Vijaykumar", title = "Opportunistic Transient-Fault Detection", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "172--183", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Balensiefer:2005:EFI, author = "Steven Balensiefer and Lucas Kregor-Stickles and Mark Oskin", title = "An Evaluation Framework and Instruction Set Architecture for Ion-Trap Based Quantum Micro-Architectures", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "186--196", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nazhandali:2005:EOS, author = "Leyla Nazhandali and Bo Zhai and Javin Olson and Anna Reeves and Michael Minuth and Ryan Helfand and Sanjay Pant and Todd Austin and David Blaauw", title = "Energy Optimization of Subthreshold-Voltage Sensor Network Processors", journal = 
j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "197--207", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hempstead:2005:ULP, author = "Mark Hempstead and Nikhil Tripathi and Patrick Mauro and Gu-Yeon Wei and David Brooks", title = "An Ultra Low Power System Architecture for Sensor Network Applications", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "208--219", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wenisch:2005:TSS, author = "Thomas F. 
Wenisch and Stephen Somogyi and Nikolaos Hardavellas and Jangwoo Kim and Anastassia Ailamaki and Babak Falsafi", title = "Temporal Streaming of Shared Memory", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "222--233", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Moshovos:2005:REC, author = "Andreas Moshovos", title = "{RegionScout}: Exploiting Coarse Grain Sharing in Snoop-Based Coherence", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "234--245", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cantin:2005:IMP, author = "Jason F. Cantin and Mikko H. Lipasti and James E. 
Smith", title = "Improving Multiprocessor Performance with Coarse-Grain Coherence Tracking", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "246--257", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hines:2005:IPE, author = "Stephen Hines and Joshua Green and Gary Tyson and David Whalley", title = "Improving Program Efficiency by Packing Instructions into Registers", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "260--271", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Clark:2005:AFT, author = "Nathan Clark and Jason Blome and Michael Chu and Scott Mahlke and Stuart Biles and Krisztian Flautner", title = "An Architecture Framework for Transparent Instruction Set Customization in Embedded Processors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "272--283", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Narayanasamy:2005:BCR, author = "Satish Narayanasamy and Gilles Pokam and Brad Calder", title = "{BugNet}: Continuously Recording Program Execution for Deterministic Replay Debugging", 
journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "284--295", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Annavaram:2005:MAL, author = "Murali Annavaram and Ed Grochowski and John Shen", title = "Mitigating {Amdahl's Law} through {EPI} Throttling", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "298--309", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "energy per instruction (EPI)", } @Article{Talpes:2005:ISP, author = "Emil Talpes and Diana Marculescu", title = "Increased Scalability and Power Efficiency by Using Multiple Speed Pipelines", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "310--321", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Petric:2005:EEP, author = "Vlad Petric and Amir Roth", title = "Energy-Effectiveness of Pre-Execution and Energy-Aware {P}-Thread Selection", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "322--333", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 
09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Second author's surname carries an acute accent (Asanovi\'c); it is
%%% written as a braced BibTeX special character so that sorting and
%%% label generation treat it as a single letter.
@Article{Zhang:2005:VRM,
  author =       "Michael Zhang and Krste Asanovi{\'c}",
  title =        "Victim Replication: Maximizing Capacity while Hiding Wire Delay in Tiled Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "336--345",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Speight:2005:AMP,
  author =       "Evan Speight and Hazim Shafi and Lixin Zhang and Ram Rajamony",
  title =        "Adaptive Mechanisms and Policies for Managing Cache Hierarchies in Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "346--356",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chishti:2005:ORC,
  author =       "Zeshan Chishti and Michael D. Powell and T. N.
Vijaykumar", title = "Optimizing Replication, Communication, and Capacity Allocation in {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "357--368", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mutlu:2005:TEP, author = "Onur Mutlu and Hyesoon Kim and Yale N. Patt", title = "Techniques for Efficient Processing in Runahead Execution Engines", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "370--381", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jimenez:2005:PLB, author = "Daniel A. 
Jim{\'e}nez",
  title =        "Piecewise Linear Branch Prediction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "382--393",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Accents restored in the author names of the entry above
%%% (Jim\'enez) and the entry below (Andr\'e).  They are written as
%%% braced BibTeX special characters; the citation keys stay
%%% accent-free so existing \cite commands keep working.
@Article{Seznec:2005:AGH,
  author =       "Andr{\'e} Seznec",
  title =        "Analysis of the {O-GEometric History Length} Branch Predictor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "394--405",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:2005:IMC,
  author =       "Rakesh Kumar and Victor Zyuban and Dean M. Tullsen",
  title =        "Interconnections in Multi-Core Architectures: Understanding Mechanisms, Overheads and Scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "408--419",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:2005:MHR,
  author =       "John Kim and William J. Dally and Brian Towles and Amit K.
Gupta", title = "Microarchitecture of a High-Radix Router", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "420--431", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Seo:2005:NOW, author = "Daeho Seo and Akif Ali and Won-Taek Lim and Nauman Rafique and Mithuna Thottethodi", title = "Near-Optimal Worst-Case Throughput Routing for Two-Dimensional Mesh Networks", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "432--443", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gandhi:2005:SLS, author = "Amit Gandhi and Haitham Akkary and Ravi Rajwar and Srikanth T. 
Srinivasan and Konrad Lai", title = "Scalable Load and Store Processing in Latency Tolerant Processors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "446--457", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Roth:2005:SVW, author = "Amir Roth", title = "{Store Vulnerability Window (SVW)}: Re-Execution Filtering for Enhanced Load Optimization", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "458--468", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Torres:2005:SBD, author = "E. F. Torres and P. Ib{\'a}{\~n}ez and V. Vi{\~n}als and J. M. Llaber{\'\i}a", title = "Store Buffer Design in First-Level Multibanked Data Caches", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "469--480", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Meixner:2005:DVS, author = "Albert Meixner and Daniel J.
Sorin", title = "Dynamic Verification of Sequential Consistency", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "482--493", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rajwar:2005:VTM, author = "Ravi Rajwar and Maurice Herlihy and Konrad Lai", title = "Virtualizing Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "494--505", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Balakrishnan:2005:IPA, author = "Saisanthosh Balakrishnan and Ravi Rajwar and Mike Upton and Konrad Lai", title = "The Impact of Performance Asymmetry in Emerging Multicore Architectures", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "506--517", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Srinivasan:2005:ESD, author = "Jayanth Srinivasan and Sarita V. Adve and Pradip Bose and Jude A. 
Rivers", title = "Exploiting Structural Duplication for Lifetime Reliability Enhancement", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "520--531", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Biswas:2005:CAV, author = "Arijit Biswas and Paul Racunas and Razvan Cheveresan and Joel Emer and Shubhendu S. Mukherjee and Ram Rangan", title = "Computing Architectural Vulnerability Factors for Address-Based Structures", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "532--543", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Qureshi:2005:VWC, author = "Moinuddin K. Qureshi and David Thompson and Yale N. 
Patt", title = "The {V-Way Cache}: Demand Based Associativity via Global Replacement", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "544--555", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2005:AI, author = "Anonymous", title = "Author Index", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "556--557", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bartolini:2005:GEI, author = "S. Bartolini and P. Foglia and C. A. 
Prete", title = "{Guests editors'} introduction", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "1--2", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101870", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this issue of ACM SigArch Newsletter, we present eight papers from the MEDEA Workshop, held in conjunction with the International Conference on Parallel Architectures and Compilation Techniques (PACT-2004) [1], [2].", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fradj:2005:EAM, author = "Hanene Ben Fradj and Asmaa el Ouardighi and C{\'e}cile Belleudy and Michel Auguin", title = "Energy aware memory architecture configuration", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "3--9", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101871", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the context of battery-driven embedded systems, reducing energy while maintaining performance is one of today's challenges. The on-chip memory count for a great part of the whole system consumption, especially for images and video processing applications that make heavy use of large memory data size. In this paper, we present new technique for efficiently exploiting on-chip memory space (cache, scratchpad) for a specific application to reduce the energy consumption without loss of performance. We configure and compare the impact of three different memory architectures on the energy consumption. 
The first one is composed of main memory with cache, in the second architecture we find a main memory and scratchpad memory and in the last architecture we combine both cache and scratchpad with the main memory. We show the effectiveness of the last architecture and a saving about 35\% in energy consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Suh:2005:DOC, author = "Hyo-Joong Suh and Sung Woo Chung", title = "{DRACO}: optimized {CC-NUMA} system with novel dual-link interconnections to reduce the memory latency", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "10--16", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101872", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The performances of multiprocessor systems mainly rely on the processor clock speed and the memory latency. As the processors speed up rapidly, the memory latency becomes a major performance bottleneck in multiprocessor systems. In this paper, we propose a dual-link interconnection topology and its effective routing scheme to reduce the remote memory latency on the interconnection network. It can be applied at a same implementation cost as traditional bi-directional ring systems. We compare the performance of the proposed system to that of the traditional bi-directional ring-based system and toroidal mesh-based system. 
By simulations, it is shown that the proposed system outperforms the traditional bi-directional ring-based system by 42--101\% and excels the toroidal mesh-based system by 4--14\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yehia:2005:LSA, author = "Sami Yehia and Jean-Fran{\c{c}}ois Collard and Olivier Temam", title = "Load squared: adding logic close to memory to reduce the latency of indirect loads with high miss ratios", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "17--24", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101873", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Indirect memory accesses, where a load is fed by another load, are ubiquitous because of rich data structures and sophisticated software conventions, such as the use of linkage tables and position independent code. Unfortunately, they can be costly: if both loads miss, two round trips to memory are required even though the role of the first load is often limited to fetching the address of the second load. To reduce the total latency of such indirect accesses, a new instruction called load squared is introduced. A load squared does two fetches, the first fetch reading the target address of the second. (An offset is optionally added to the result of the first fetch.) The load squared operation is performed by memory-side logic (typically, the memory controller if it isn't located on the main processor chip). In this study, load squared is not an architecturally visible instruction: the micro-architecture transparently decides which loads should be replaced by loads squared.
We show that performance is sometimes improved significantly, and never degraded.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kobayashi:2005:LAC, author = "Hiroaki Kobayashi and Isao Kotera and Hiroyuki Takizawa", title = "Locality analysis to control dynamically way-adaptable caches", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "25--32", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101874", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a control mechanism for dynamically way-adaptable caches. The mechanism uses the local and global information about the locality of reference during execution. As the local information, the cache access pattern is evaluated based on the statistics of the LRU (Least-Recently Used) states of cache entries referenced. If the memory accesses are concentrated on and near the most recently used entries, the mechanism knows that the locality of reference is very high and there is room to decrease the number of ways activated to fit the current locality. On the other hand, if the accesses are widely distributed from the most recently used entries to the least recently used ones, the mechanism understands that more ways are needed to improve the performance as long as the resources are available. In addition, to examine the global behavior of the locality of reference, an n-bit state machine like n-bit branch predictors is introduced into the mechanism. The state machine traces a sequence of cache resizing requests and evaluates its stability across the execution time. Therefore, the state machine helps the mechanism avoid unstable actions for enabling/disabling cache ways when the locality shows the highly irregular behavior. 
The experimental results indicate that an n-bit asymmetric state machine using the LRU status information works well to appropriately control cache ways even in the case of the benchmarks with highly-irregular access behaviors in cache references.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Arakawa:2005:SXE, author = "F. Arakawa and M. Ishikawa and Y. Kondo and T. Kamei and M. Ozawa and O. Nishii and T. Hattori", title = "{SH-X}: an embedded processor core for consumer appliances", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "33--40", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101875", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A SuperH\TM{} embedded processor core SH-X implemented in a 130-nm CMOS process running at 400 MHz achieved 720 MIPS and 2.8 GFLOPS at a power of 250 mW under worst-case conditions. It has a dual-issue seven-stage pipeline architecture, but reaches the 1.8 MIPS/MHz of the previous five-stage processor. The on-chip memory configuration is tuned for digital consumer appliances. A new resume-standby mode enables a standby current of less than 100$ \mu $A and a 3-ms recovery time. 
The processor meets the requirements of a wide range of applications, and is suitable for digital appliances aimed at the consumer market, such as cellular phones, digital still/video cameras, and car navigation systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Naz:2005:IDC, author = "Afrin Naz and Mehran Rezaei and Krishna Kavi and Philip Sweany", title = "Improving data cache performance with integrated use of split caches, victim cache and stream buffers", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "41--48", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101876", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In our prior work we explored a cache organization providing architectural support for distinguishing between memory references that exhibit spatial and temporal locality and mapping them to separate caches. That work showed that using separate (data) caches for indexed or stream data and scalar data items could lead to substantial improvements in terms of cache misses. In addition, such a separation allowed for the design of caches that could be tailored to meet the properties exhibited by different data items. In this paper, we investigate the interaction between three established methods: split cache, victim cache and stream buffer. Since significant amounts of compulsory and conflict misses are avoided, the size of each cache (i.e., array and scalar), as well as the combined cache capacity can be reduced. 
Our results show that on average 55\% reduction in miss rates over the base configuration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "array cache; memory access time; scalar cache; stream buffer; victim cache", } @Article{Pajuelo:2005:SEH, author = "Alex Pajuelo and Antonio Gonz{\'a}lez and Mateo Valero", title = "Speculative execution for hiding memory latency", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "49--56", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101877", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "L2 misses are one of the main causes for stalling the activity in current and future microprocessors. In this paper we present a mechanism to speculatively execute independent instructions of L2-miss loads, even if no entry in the reorder buffer is available. The proposed mechanism generates future instances of instructions that are expected to be independent of the delinquent load. When these dynamic instructions are later fetched, they use the previously precomputed data and directly go to the commit stage without executing. The mechanism replicates strided loads found above the L2-miss load, that produce the data for the target independent instructions. Instructions following the L2-miss load will check if their source operands have been replicated. In this case, multiple speculative instances of them will also be generated. This mechanism is built on top of a superscalar processor with an aggressive prefetch scheme. 
Compared to this baseline, the mechanism obtains 21\% of performance improvement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Verdu:2005:ITA, author = "Javier Verd{\'u} and Jorge Garc{\'\i}a and Mario Nemirovsky and Mateo Valero", title = "The impact of traffic aggregation on the memory performance of networking applications", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "57--62", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1152922.1101878", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The trend of the networking processing is to increase the intelligence of the routers (i.e. security capacities). This means that there is an increment in the workload generated per packet and new types of applications are emerging, such as stateful programs. On the other hand, Internet traffic continues to grow vigorously. This fact involves an increment of the traffic aggregation levels and overloads the processing capacities of the routers. In this paper we show the importance of traffic aggregation level on networking application studies. We also classify the applications according to the data management of the packet processing. Hence, we present the different impacts on the data cache performance depending on the application category. Our results show that traffic aggregation level may affect the cache performance depending on the networking application category. 
Stateful applications show a significant sensitivity to this impact.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Allu:2005:ERC, author = "Bramha Allu and Wei Zhang", title = "Exploiting the replication cache to improve performance for multiple-issue microprocessors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "63--71", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1101868.1101880", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Performance and reliability are both of great importance for microprocessor design. Recently, the replication cache has been proposed to enhance data cache reliability against soft errors. The replication cache is a small fully associative cache to store the replica for every write to the L1 data cache. In addition to enhance data reliability, this paper proposes several cost-effective techniques to improve performance of multiple-issue microprocessors by exploiting the replication cache. The idea is to make use of the replication cache to increase cache bandwidth through dual load and to reduce the L1 data cache miss rate through partial victim caching. Built upon these two schemes, we also propose a hybrid approach to combine the benefits of both dual load and partial victim caching for improving performance further. 
Our experimental results show that exploiting a replication cache with only 8 entries can improve performance by 13.0\% on average without compromising the enhanced data integrity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2005:INb, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "72--74", month = jun, year = "2005", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1101868.1101882", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2005:MW, author = "Anonymous", title = "{MEDEA 2004} workshop", journal = j-COMP-ARCH-NEWS, volume = "33", number = "3", pages = "??--??", month = jun, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:44 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jouppi:2005:ISI, author = "Norman P. 
Jouppi and Rakesh Kumar and Dean Tullsen", title = "Introduction to the special issue on the {2005 Workshop on Design, Analysis, and Simulation of Chip Multiprocessors (dasCMP'05)}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "4--4", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Laudon:2005:PWN, author = "James Laudon", title = "Performance\slash Watt: the new server focus", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "5--13", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Davis:2005:RRA, author = "John D. 
Davis and Cong Fu and James Laudon", title = "The {RASE (Rapid, Accurate Simulation Environment)} for chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "14--23", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Hsu:2005:ECD, author = "Lisa Hsu and Ravi Iyer and Srihari Makineni and Steve Reinhardt and Donald Newell", title = "Exploring the cache design space for large scale {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "24--33", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Davis:2005:CPS, author = "John D. Davis and Stephen E. 
Richardson and Charis Charitsis and Kunle Olukotun", title = "A chip prototyping substrate: the flexible architecture for simulation and testing {(FAST)}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "34--43", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Vachharajani:2005:CMP, author = "Neil Vachharajani and Matthew Iyer and Chinmay Ashok and Manish Vachharajani and David I. August and Daniel Connors", title = "Chip multi-processor scalability for single-threaded applications", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "44--53", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Chen:2005:HMP, author = "Julia Chen and Philo Juang and Kevin Ko and Gilberto Contreras and David Penry and Ram Rangan and Adam Stoler and Li-Shiuan Peh and Margaret Martonosi", title = "Hardware-modulated parallelism in chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "54--63", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", 
remark = "Special issue: dasCMP'05.", } @Article{Sampson:2005:FSC, author = "Jack Sampson and Rub{\'e}n Gonz{\'a}lez and Jean-Fran{\c{c}}ois Collard and Norman P. Jouppi and Mike Schlansker", title = "Fast synchronization for chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "64--69", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Shayesteh:2005:DCS, author = "Anahita Shayesteh and Glenn Reinman and Norman Jouppi and Suleyman Sair and Tim Sherwood", title = "Dynamically configurable shared {CMP} helper engines for improved performance", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "70--79", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Constantinou:2005:PIS, author = "Theofanis Constantinou and Yiannakis Sazeides and Pierre Michaud and Damien Fetis and Andr{\'e} Seznec", title = "Performance implications of single thread migration on a chip multi-core", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "80--91", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Special issue: dasCMP'05.", } @Article{Martin:2005:MGE, author = "Milo M. K. Martin and Daniel J. Sorin and Bradford M. Beckmann and Michael R. Marty and Min Xu and Alaa R. Alameldeen and Kevin E. Moore and Mark D. Hill and David A. Wood", title = "Multifacet's general execution-driven multiprocessor simulator {(GEMS)} toolset", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "92--99", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2005:DMS, author = "David Wang and Brinda Ganesh and Nuengwong Tuaycharoen and Kathleen Baynes and Aamer Jaleel and Bruce Jacob", title = "{DRAMsim}: a memory system simulator", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "100--107", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rountree:2005:NH, author = "Barry Rountree and Robert Springer and David K. Lowenthal and Vincent W. 
Freeh", title = "Notes from {HPPAC 2005}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "108--112", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2005:GFB, author = "H. C. Wang and C. K. Yuen", title = "A general framework to build new {CPUs} by mapping abstract machine code to instruction level parallel execution hardware", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "113--120", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sam:2005:IMS, author = "Nana B. 
Sam and Martin Burtscher", title = "Improving memory system performance with energy-efficient value speculation", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "121--127", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2005:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "128--133", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kaeli:2005:WIS, author = "David Kaeli and Robert Cohn", title = "{WBIA'05}: Introduction to the special issue", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "1--2", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Hu:2005:CCI, author = "Chunling Hu and John McCabe and Daniel A. 
Jim{\'e}nez and Ulrich Kremer", title = "The {Camino Compiler} infrastructure", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "3--8", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Schulz:2005:SDB, author = "Martin Schulz and Dong Ahn and Andrew Bernat and Bronis R. de Supinski and Steven Y. Ko and Gregory Lee and Barry Rountree", title = "Scalable dynamic binary instrumentation for {Blue Gene/L}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "9--14", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Borin:2005:DBC, author = "Edson Borin and Cheng Wang and Youfeng Wu and Guido Araujo", title = "Dynamic binary control-flow errors detection", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "15--20", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Moffie:2005:AAS, author = "Micha Moffie and David Kaeli", title = "{ASM}: application security monitor", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "21--26", month = dec, 
year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Zhao:2005:DMO, author = "Qin Zhao and Rodric Rabbah and Weng-Fai Wong", title = "Dynamic memory optimization using pool allocation and prefetching", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "27--32", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Gao:2005:AAL, author = "Xiaofeng Gao and Beth Simon and Allan Snavely", title = "{ALITER}: an asynchronous lightweight instrumentation tool for event recording", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "33--38", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{McCurdy:2005:UPM, author = "Collin McCurdy and Charles Fischer", title = "Using {Pin} as a memory reference generator for multiprocessor simulation", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "39--44", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Pan:2005:CPE, author = "Heidi Pan and Krste Asanovi{\'c} and Robert Cohn and Chi-Keung Luk", title = "Controlling program execution through binary instrumentation", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "45--50", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Faroughi:2005:PPP, author = "Nikrouz Faroughi", title = "Profiling of parallel processing programs on shared memory multiprocessors using {Simics}", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "51--56", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Kumar:2005:TDD, author = "Naveen Kumar and Ramesh Peri", title = "Transparent debugging of dynamically instrumented programs", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "57--62", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = 
"WBIA'05", } @Article{Harris:2005:PAS, author = "Laune C. Harris and Barton P. Miller", title = "Practical analysis of stripped binary code", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "63--68", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Reddi:2005:PDC, author = "Vijay Janapa Reddi and Dan Connors and Robert S. Cohn", title = "Persistence in dynamic code transformation systems", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "69--74", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Srinivasan:2005:MMC, author = "Ram Srinivasan and Olaf Lubeck", title = "{MonteSim}: a {Monte Carlo} performance model for in-order microarchitectures", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "75--80", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Laurenzano:2005:LCT, author = "Michael Laurenzano and Beth Simon and Allan Snavely and Meghan Gunn", title = "Low cost trace-driven memory simulation using {SimPoint}", journal = j-COMP-ARCH-NEWS, 
volume = "33", number = "5", pages = "81--86", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "WBIA'05", } @Article{Thorson:2005:INd, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "33", number = "5", pages = "87--93", month = dec, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bartolini:2006:MPD, author = "S. Bartolini and P. Foglia and R. Giorgi and C. A. Prete", title = "Memory performance: dealing with applications, systems and architecture", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "1--2", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Friedman:2006:DCR, author = "Scott Friedman and Praveen Krishnamurthy and Roger Chamberlain and Ron K. Cytron and Jason E. 
Fritts", title = "Dusty caches for reference counting garbage collection", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "3--10", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ramaswamy:2006:DTC, author = "Subramanian Ramaswamy and Jaswanth Sreeram and Sudhakar Yalamanchili and Krishna V. Palem", title = "Data trace cache: an application specific cache architecture", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "11--18", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Naz:2006:MCS, author = "Afrin Naz and Krishna Kavi and Mehran Rezaei and Wentong Li", title = "Making a case for split data caches for embedded applications", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "19--26", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Allu:2006:ERC, author = "B. Allu and W. Zhang and M. 
Kandala", title = "Exploiting the replication cache to improve cache read bandwidth cost effectively", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "27--32", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Monchiero:2006:EST, author = "Matteo Monchiero and Gianluca Palermo and Cristina Silvano and Oreste Villa", title = "An efficient synchronization technique for multiprocessor systems on-chip", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "33--40", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khunjush:2006:HMD, author = "Farshad Khunjush and Nikitas J. 
Dimopoulos", title = "Hiding message delivery and reducing memory access latency by providing direct-to-cache transfer during receive operations in a message passing environment", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "41--48", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yue:2006:NCB, author = "Yao Yue and Chuang Lin and Zhangxi Tan", title = "{NPCryptBench}: a cryptographic benchmark suite for network processors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "49--56", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lopez-Lagunas:2006:MBO, author = "Abelardo L{\'o}pez-Lagunas and Sek M. 
Chai", title = "Memory bandwidth optimization through stream descriptors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "57--64", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chiyonobu:2006:EEI, author = "Akihiro Chiyonobu and Toshinori Sato", title = "Energy-efficient instruction scheduling utilizing cache miss information", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "65--70", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bardine:2006:AEV, author = "Alessandro Bardine and Alessio Bechini and Pierfrancesco Foglia and Cosimo Antonio Prete", title = "Analysis of embedded video coder systems: a system-level approach", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "71--76", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gontmakher:2006:ILG, author = "Alex Gontmakher and Assaf Schuster and Avi Mendelson", title = "{Inthreads}: a low granularity parallelization model", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "77--80", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = 
"0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2006:INa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "34", number = "1", pages = "81--86", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patt:2006:CAR, author = "Yale Patt", title = "Computer Architecture Research and Future Microprocessors: Where Do We Go from Here?", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "2--2", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2006:GDE, author = "Jongman Kim and Chrysostomos Nicopoulos and Dongkook Park", title = "A Gracefully Degrading and Energy-Efficient Modular Router Architecture for On-Chip Networks", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "4--15", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2006:MGC, author = "Anonymous", title = "Message from the General Chair", 
journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "10--10", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2006:MPC, author = "Anonymous", title = "Message from the Program Chair", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "11--11", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2006:R, author = "Anonymous", title = "Reviewers", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "14--14", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Scott:2006:BHR, author = "Steve Scott and Dennis Abts and John Kim and William J. 
Dally", title = "The {BlackWidow} High-Radix {Clos} Network", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "16--28", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2006:SG, author = "Anonymous", title = "{SIGARCH} Guidelines", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "17--17", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Arvind:2006:MMI, author = "Arvind and Jan-Willem Maessen", title = "Memory Model $=$ Instruction Reordering $+$ Store Atomicity", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "29--40", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{vonPraun:2006:CMO, author = "Christoph von Praun and Harold W. 
Cain and Jong-Deok Choi and Kyung Dong Ryu", title = "Conditional Memory Ordering", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "41--52", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{McDonald:2006:ASP, author = "Austen McDonald and JaeWoong Chung and Brian D. Carlstrom and Chi Cao Minh and Hassan Chafi and Christos Kozyrakis and Kunle Olukotun", title = "Architectural Semantics for Practical Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "53--65", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ranganathan:2006:ELP, author = "Parthasarathy Ranganathan and Phil Leech and David Irwin and Jeffrey Chase", title = "Ensemble-level Power Management for Dense Blade Servers", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "66--77", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Donald:2006:TMT, author = "James Donald and Margaret Martonosi", title = "Techniques for Multicore Thermal Management: Classification and New Exploration", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "78--88", year = 
"2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lin:2006:SLP, author = "Yuan Lin and Hyunseok Lee and Mark Woh and Yoav Harel and Scott Mahlke and Trevor Mudge and Chaitali Chakrabarti and Krisztian Flautner", title = "{SODA}: a Low-power Architecture For Software Radio", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "89--101", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shi:2006:IFD, author = "Weidong Shi and Hsien-Hsin S. Lee and Laura Falk and Mrinmoy Ghosh", title = "An Integrated Framework for Dependable and Revivable Architectures Using Multicore Processors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "102--113", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hankins:2006:MIS, author = "Richard A. Hankins and Gautham N. Chinya and Jamison D. Collins and Perry H. Wang and Ryan Rakvic and Hong Wang and John P. 
Shen", title = "Multiple Instruction Stream Processor", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "114--127", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Emma:2006:ESR, author = "Philip Emma", title = "The End of Scaling? Revolutions in Technology and Microarchitecture as We Pass the 90 Nanometer Node", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "128--128", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:2006:DMC, author = "Feihui Li and Chrysostomos Nicopoulos and Thomas Richardson and Yuan Xie and Vijaykrishnan Narayanan and Mahmut Kandemir", title = "Design and Management of {$3$D} Chip Multiprocessors Using Network-in-Memory", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "130--141", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Garg:2006:SMD, author = "Alok Garg and M. 
Wasiur Rashid and Michael Huang", title = "Slackened Memory Dependence Enforcement: Combining Opportunistic Forwarding with Decoupled Verification", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "142--154", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:2006:BCR, author = "Chuanjun Zhang", title = "Balanced Cache: Reducing Conflict Misses of Direct-Mapped Caches", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "155--166", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Qureshi:2006:CMA, author = "Moinuddin K. Qureshi and Daniel N. Lynch and Onur Mutlu and Yale N. 
Patt", title = "A Case for {MLP}-Aware Cache Replacement", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "167--178", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yan:2006:ICP, author = "Chenyu Yan and Daniel Englender and Milos Prvulovic and Brian Rogers and Yan Solihin", title = "Improving Cost, Performance, and Security of Memory Encryption and Authentication", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "179--190", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brodie:2006:SAH, author = "Benjamin C. Brodie and David E. Taylor and Ron K. 
Cytron", title = "A Scalable Architecture For High-Throughput Regular-Expression Pattern Matching", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "191--202", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hasan:2006:CSE, author = "Jahangir Hasan and Srihari Cadambi and Venkatta Jakkula and Srimat Chakradhar", title = "{Chisel}: a Storage-efficient, Collision-free Hash-based Network Processing Architecture", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "203--215", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Colohan:2006:TDB, author = "Christopher B. Colohan and Anastassia Ailamaki and J. Gregory Steffan and Todd C. 
Mowry", title = "Tolerating Dependences Between Large Speculative Threads Via Sub-Threads", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "216--226", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ceze:2006:BDS, author = "Luis Ceze and James Tuck and Josep Torrellas and Calin Cascaval", title = "Bulk Disambiguation of Speculative Threads in Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "227--238", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Choi:2006:LBS, author = "Seungryul Choi and Donald Yeung", title = "Learning-Based {SMT} Processor Resource Distribution via Hill-Climbing", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "239--251", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Somogyi:2006:SMS, author = "Stephen Somogyi and Thomas F. 
Wenisch and Anastassia Ailamaki and Babak Falsafi and Andreas Moshovos", title = "Spatial Memory Streaming", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "252--263", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chang:2006:CCC, author = "Jichuan Chang and Gurindar S. Sohi", title = "Cooperative Caching for Chip Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "264--276", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hu:2006:RST, author = "Shiliang Hu and James E. 
Smith", title = "Reducing Startup Time in Co-Designed Virtual Machines", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "277--288", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yang:2006:TAD, author = "Qing Yang and Weijun Xiao and Jin Ren", title = "{TRAP}-Array: a Disk Array Architecture Providing Timely Recovery to Any Point-in-time", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "289--301", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Balakrishnan:2006:PDD, author = "Saisanthosh Balakrishnan and Gurindar S. Sohi", title = "Program Demultiplexing: Data-flow based Speculative Parallelization of Methods in Sequential Programs", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "302--313", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Swanson:2006:APT, author = "Steven Swanson and Andrew Putnam and Martha Mercaldi and Ken Michelson and Andrew Petersen and Andrew Schwerin and Mark Oskin and Susan J. 
Eggers", title = "Area-Performance Trade-offs in Tiled Dataflow Architectures", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "314--326", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Strauss:2006:FSA, author = "Karin Strauss and Xiaowei Shen and Josep Torrellas", title = "Flexible Snooping: Adaptive Forwarding and Filtering of Snoops in Embedded-Ring Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "327--338", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheng:2006:IAC, author = "Liqun Cheng and Naveen Muralimanohar and Karthik Ramani and Rajeev Balasubramonian and John B. 
Carter", title = "Interconnect-Aware Coherence Protocols for Chip Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "339--351", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Herrod:2006:FVT, author = "Steve Herrod", title = "The Future of Virtualization Technology", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "352--352", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{VanMeter:2006:DAQ, author = "Rodney {Van Meter} and Kae Nemoto and W. J. Munro and Kohei M. 
Itoh", title = "Distributed Arithmetic on a Quantum Multicomputer", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "354--365", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Isailovic:2006:INS, author = "Nemanja Isailovic and Yatish Patel and Mark Whitney and John Kubiatowicz", title = "Interconnection Networks for Scalable Quantum Computers", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "366--377", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thaker:2006:QMH, author = "Darshan D. Thaker and Tzvetan S. Metodi and Andrew W. Cross and Isaac L. Chuang and Frederic T. 
Chong", title = "Quantum Memory Hierarchies: Efficient Designs to Match Available Parallelism in Quantum Computing", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "378--390", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2006:AI, author = "Anonymous", title = "Author Index", journal = j-COMP-ARCH-NEWS, volume = "34", number = "2", pages = "391--391", year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Aug 21 15:00:05 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burtscher:2006:TTA, author = "Martin Burtscher", title = "{TCgen 2.0}: a tool to automatically generate lossless trace compressors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "3", pages = "1--8", month = jun, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Sep 4 12:39:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kumar:2006:LLB, author = "Abhas Kumar and Nisheet Jain and Mainak Chaudhuri", title = "Long-latency branches: how much do they matter?", journal = j-COMP-ARCH-NEWS, volume = "34", number = "3", pages = "9--15", month = jun, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Sep 4 12:39:50 MDT 2006", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2006:INb, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "34", number = "3", pages = "16--21", month = jun, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Sep 4 12:39:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Henning:2006:SCB, author = "John L. Henning", title = "{SPEC CPU2006} benchmark descriptions", journal = j-COMP-ARCH-NEWS, volume = "34", number = "4", pages = "1--17", month = sep, year = "2006", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1186736.1186737", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:07:09 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "On August 24, 2006, the Standard Performance Evaluation Corporation (SPEC) announced CPU2006 [2], which replaces CPU2000. 
The SPEC CPU benchmarks are widely used in both industry and academia [3].", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Citron:2006:HGM, author = "Daniel Citron and Adham Hurani and Alaa Gnadrey", title = "The harmonic or geometric mean: does it really matter?", journal = j-COMP-ARCH-NEWS, volume = "34", number = "4", pages = "18--25", month = sep, year = "2006", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1186736.1186738", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:07:09 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "For several decades, computer scientists have been arguing which mean is more appropriate for summarizing computer performance: the harmonic or the geometric. We show that many test cases used in the past to discredit one mean or the other are either artificial or incidental. Changing only one of the benchmarks may result in totally different conclusions. In addition, we conclude that for the SPEC CPU2000 benchmark suite, the choice of averaging has very little influence on the relative standing of different machines. 
Therefore, the decision to purchase one system rather than another should not be influenced by the type of averaging used.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Poe:2006:BBS, author = "James Poe and Tao Li", title = "{BASS}: a benchmark suite for evaluating architectural security systems", journal = j-COMP-ARCH-NEWS, volume = "34", number = "4", pages = "26--33", month = sep, year = "2006", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1186736.1186739", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:07:09 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As software vulnerabilities continue to be exposed on a daily basis and the motivation of cunning adversaries to compromise valuable computer assets grows, novel methods must be developed to ensure security. Recently there has been a growing interest within the computer architecture research community in designing architectural and hardware mechanisms to improve security. Unfortunately, there is currently not a representative set of benchmarks for evaluating the security features of proposed hardware modifications. The frequent result is that great effort is often spent searching for vulnerable programs, and/or evaluations suffer from a lack of diversity. To address this problem, we developed BASS, a benchmark suite to evaluate the security features of proposed architectural solutions under various malicious attack scenarios. BASS v 1.0 currently consists of seven benchmarks chosen to cover a diverse range of architectural attack characteristics. To facilitate the use of these benchmarks in architectural security research, we have developed both vulnerable programs and scripts to automatically generate exploits targeting those vulnerable programs across both 32-bit x86 and 64-bit Alpha Linux platforms. 
The entire BASS framework including documentation, source code, input data sets, and precompiled binaries for the M5 full system simulator is released under the GNU GPL and can be freely downloaded at http://www.ideal.ece.ufl.edu/bass.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2006:IN, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "34", number = "4", pages = "34--37", month = sep, year = "2006", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1186736.1186741", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:07:09 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network. As always, the opinions expressed in this column are the personal views of the authors, and do not necessarily represent the institutions to which they are affiliated. Text which sets the context of a message appears underlined or in italics; this is usually text the author has quoted from earlier messages. 
The code-like expressions below the authors' names are their addresses on Internet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rosenblum:2006:IVC, author = "Mendel Rosenblum", title = "Impact of virtualization on computer architecture and operating systems", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "1--1", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Adams:2006:CSH, author = "Keith Adams and Ole Agesen", title = "A comparison of software and hardware techniques for {x86} virtualization", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "2--13", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jones:2006:GMB, author = "Stephen T. Jones and Andrea C. Arpaci-Dusseau and Remzi H. 
Arpaci-Dusseau", title = "{Geiger}: monitoring the buffer cache in a virtual machine environment", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "14--24", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Crandall:2006:TSD, author = "Jedidiah R. Crandall and Gary Wassermann and Daniela A. S. de Oliveira and Zhendong Su and S. Felix Wu and Frederic T. Chong", title = "Temporal search: detecting hidden malware timebombs with virtual machines", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "25--36", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lu:2006:ADA, author = "Shan Lu and Joseph Tucek and Feng Qin and Yuanyuan Zhou", title = "{AVIO}: detecting atomicity violations via access interleaving invariants", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "37--48", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Xu:2006:RTR, author = "Min Xu and Mark D. 
Hill and Rastislav Bodik", title = "A regulated transitive reduction ({RTR}) for longer memory race recording", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "49--60", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bond:2006:BBE, author = "Michael D. Bond and Kathryn S. McKinley", title = "{Bell}: bit-encoding online memory leak detection", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "61--72", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shyam:2006:ULC, author = "Smitha Shyam and Kypros Constantinides and Sujay Phadke and Valeria Bertacco and Todd Austin", title = "Ultra low-cost defect protection for microprocessor pipelines", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "73--82", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Reddy:2006:UPB, author = "Vimal K. 
Reddy and Eric Rotenberg and Sailashri Parthasarathy", title = "Understanding prediction-based partial redundant threading for low-overhead, high-coverage fault tolerance", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "83--94", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Parashar:2006:SSB, author = "Angshuman Parashar and Anand Sivasubramaniam and Sudhanva Gurumurthi", title = "{SlicK}: slice-based locality exploitation for efficient redundant multithreading", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "95--105", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Heath:2006:MFT, author = "Taliver Heath and Ana Paula Centeno and Pradeep George and Luiz Ramos and Yogesh Jaluria", title = "{Mercury} and {Freon}: temperature emulation and management for server systems", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "106--116", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kgil:2006:PUS, author = "Taeho Kgil and Shaun D'Souza and Ali Saidi and Nathan Binkert and Ronald Dreslinski and Trevor 
Mudge and Steven Reinhardt and Krisztian Flautner", title = "{PicoServer}: using {$3$D} stacking technology to enable a compact energy efficient chip multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "117--128", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Coons:2006:SPS, author = "Katherine E. Coons and Xia Chen and Doug Burger and Kathryn S. McKinley and Sundeep K. Kushwaha", title = "A spatial path scheduling algorithm for {EDGE} architectures", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "129--140", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mercaldi:2006:IST, author = "Martha Mercaldi and Steven Swanson and Andrew Petersen and Andrew Putnam and Andrew Schwerin and Mark Oskin and Susan J. Eggers", title = "Instruction scheduling for a tiled dataflow architecture", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "141--150", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gordon:2006:ECG, author = "Michael I. 
Gordon and William Thies and Saman Amarasinghe", title = "Exploiting coarse-grained task, data, and pipeline parallelism in stream programs", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "151--162", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mishra:2006:TES, author = "Mahim Mishra and Timothy J. Callahan and Tiberiu Chelcea and Girish Venkataramani and Seth C. Goldstein and Mihai Budiu", title = "{Tartan}: evaluating spatial computation for whole program execution", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "163--174", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eyerman:2006:PCA, author = "Stijn Eyerman and Lieven Eeckhout and Tejas Karkhanis and James E. Smith", title = "A performance counter architecture for computing accurate {CPI} components", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "175--184", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:2006:AER, author = "Benjamin C. Lee and David M. 
Brooks", title = "Accurate and efficient regression modeling for microarchitectural performance and power prediction", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "185--194", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ipek:2006:EEA, author = "Engin {\"I}pek and Sally A. McKee and Rich Caruana and Bronis R. de Supinski and Martin Schulz", title = "Efficiently exploring architectural design spaces via predictive modeling", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "195--206", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kharbutli:2006:CEP, author = "Mazen Kharbutli and Xiaowei Jiang and Yan Solihin and Guru Venkataramani and Milos Prvulovic", title = "Comprehensively and efficiently protecting the heap", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "207--218", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chilimbi:2006:HIH, author = "Trishul M. 
Chilimbi and Vinod Ganapathy", title = "{HeapMD}: identifying heap-based bugs using anomaly detection", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "219--228", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Narayanasamy:2006:RSM, author = "Satish Narayanasamy and Cristiano Pereira and Brad Calder", title = "Recording shared memory dependencies using strata", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "229--240", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patwardhan:2006:DTS, author = "Jaidev P. Patwardhan and Vijeta Johri and Chris Dwyer and Alvin R. Lebeck", title = "A defect tolerant self-organizing nanoscale {SIMD} architecture", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "241--251", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Schuchman:2006:PTA, author = "Ethan Schuchman and T. N. 
Vijaykumar", title = "A program transformation and architecture support for quantum uncomputation", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "252--263", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mysore:2006:IC, author = "Shashidhar Mysore and Banit Agrawal and Navin Srivastava and Sheng-Chih Lin and Kaustav Banerjee and Tim Sherwood", title = "Introspective {$3$D} chips", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "264--273", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cantin:2006:SP, author = "Jason F. Cantin and Mikko H. Lipasti and James E. Smith", title = "Stealth prefetching", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "274--282", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chakraborty:2006:CSE, author = "Koushik Chakraborty and Philip M. Wells and Gurindar S. 
Sohi", title = "Computation spreading: employing hardware migration to specialize {CMP} cores on-the-fly", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "283--292", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Miller:2006:SBI, author = "Jason E. Miller and Anant Agarwal", title = "Software-based instruction caching for embedded processors", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "293--302", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:2006:MEM, author = "Xin Li and Marian Boldt and Reinhard von Hanxleden", title = "Mapping {Esterel} onto a multi-threaded embedded processor", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "303--314", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Binkert:2006:INI, author = "Nathan L. Binkert and Ali G. Saidi and Steven K. 
Reinhardt", title = "Integrated network interfaces for high-bandwidth {TCP\slash IP}", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "315--324", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", }

@Article{Tarditi:2006:AUD,
  author =       "David Tarditi and Sidd Puri and Jose Oglesby",
  title =        "{Accelerator}: using data parallelism to program {GPUs} for general-purpose uses",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "325--335",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1168857.1168898",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPUs are difficult to program for general-purpose uses. Programmers can either learn graphics APIs and convert their applications to use graphics pipeline operations or they can use stream programming abstractions of GPUs. We describe Accelerator, a system that uses data parallelism to program GPUs for general-purpose uses instead. Programmers use a conventional imperative programming language and a library that provides only high-level data-parallel operations. No aspects of GPUs are exposed to programmers. The library implementation compiles the data-parallel operations on the fly to optimized GPU pixel shader code and API calls. We describe the compilation techniques used to do this. We evaluate the effectiveness of using data parallelism to program GPUs by providing results for a set of compute-intensive benchmarks. We compare the performance of Accelerator versions of the benchmarks against hand-written pixel shaders. The speeds of the Accelerator versions are typically within 50\% of the speeds of hand-written pixel shader code. Some benchmarks significantly outperform C versions on a CPU: they are up to 18 times faster than C code running on a CPU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Damron:2006:HTM,
  author =       "Peter Damron and Alexandra Fedorova and Yossi Lev and Victor Luchangco and Mark Moir and Daniel Nussbaum",
  title =        "Hybrid transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "336--346",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chuang:2006:UPB,
  author =       "Weihaw Chuang and Satish Narayanasamy and Ganesh Venkatesh and Jack Sampson and Michael {Van Biesbrouck} and Gilles Pokam and Brad Calder and Osvaldo Colavin",
  title =        "Unbounded page-based transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "347--358",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moravan:2006:SNT, author = "Michelle J. Moravan and Jayaram Bobba and Kevin E. Moore and Luke Yen and Mark D. Hill and Ben Liblit and Michael M. Swift and David A.
Wood", title = "Supporting nested transactional memory in {LogTM}", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "359--370", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", }

@Article{Chung:2006:TTM,
  author =       "JaeWoong Chung and Chi Cao Minh and Austen McDonald and Travis Skare and Hassan Chafi and Brian D. Carlstrom and Christos Kozyrakis and Kunle Olukotun",
  title =        "Tradeoffs in transactional memory virtualization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "371--381",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kawahito:2006:NIR,
  author =       "Motohiro Kawahito and Hideaki Komatsu and Takao Moriyama and Hiroshi Inoue and Toshio Nakatani",
  title =        "A new idiom recognition framework for exploiting hardware-assist instructions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "382--393",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bansal:2006:AGP, author = "Sorav Bansal and Alex Aiken", title = "Automatic generation of peephole superoptimizers", journal = j-COMP-ARCH-NEWS, volume = "34", number =
"5", pages = "394--403", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Solar-Lezama:2006:CSF, author = "Armando Solar-Lezama and Liviu Tancau and Rastislav Bodik and Sanjit Seshia and Vijay Saraswat", title = "Combinatorial sketching for finite programs", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "404--415", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DaSilva:2006:PPA, author = "Jeff {Da Silva} and J. Gregory Steffan", title = "A probabilistic pointer analysis for speculative optimizations", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "416--425", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tullsen:2007:ISI, author = "Dean Tullsen and Rakesh Kumar and Norman P. 
Jouppi", title = "Introduction to the special issue on the {2006 Workshop on Design, Analysis, and Simulation of Chip Multiprocessors: (dasCMP'06)}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "2--2", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241605", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Chip multiprocessor architectures are becoming increasingly attractive as an option to provide high instruction throughput while keeping power and complexity under control. Such architectures have also been shown to have scalability and productivity advantages. Multi-core processors are fast becoming mainstream.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "{DASCMP'06}", } @Article{Mahesri:2007:HSS, author = "Aqeel Mahesri and Nicholas J. Wang and Sanjay J. Patel", title = "Hardware support for software controlled multithreading", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "3--12", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241606", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Chip multi-processors have emerged as one of the most effective uses of the huge number of transistors available today and in the future, but questions remain as to the best way to leverage CMPs to accelerate single threaded applications. Previous approaches rely on significant speculation to accomplish this goal. 
Our proposal, NXA, is less speculative than previous proposals, relying heavily on software to guarantee thread correctness, though still allowing parallelism in the presence of ambiguous dependences. It divides a single thread of execution into multiple using the master-worker paradigm where some set of master threads execute code that spawns tasks for other, worker threads. The master threads generally consist of performance critical instructions that can prefetch data, compute critical control decisions, or compute performance critical dataflow slices. This prevents non-critical instructions from competing with critical instructions for processor resources, allowing the critical thread (and thus the workload) to complete faster. Empirical results from performance simulation show a 20\% improvement in performance on a 2-way CMP machine, demonstrating that software controlled multithreading can indeed provide a benefit in the presence of hardware support.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "{DASCMP'06}", } @Article{Shi:2007:CCP, author = "Xudong Shi and Feiqi Su and Jih-kwon Peir and Ye Xia and Zhen Yang", title = "{CMP} cache performance projection: accessibility vs. capacity", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "13--20", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241607", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Efficient utilizing on-chip storage space on Chip-Multiprocessors (CMPs) has become an important research topic. Tradeoffs between data accessibility and effective on-chip capacity have been studied extensively. It requires costly simulations to understand a wide-spectrum of the design space. 
In this paper, we first develop an abstract model for understanding the performance impact with respect to data replication. To overcome the lack of real-time interactions among multiple cores in the abstract model, we propose a global stack simulation strategy to study the performance of a variety of cache organizations on CMPs. The global stack logically incorporates a shared stack and per-core private stacks to collect shared/private reuse (stack) distances for every memory reference in a single simulation pass. With the collected reuse distances, performance in terms of hits/misses and average memory access times can be calculated for various cache organizations. We verify the stack results against individual execution-driven simulations that consider realistic cache parameters and delays using a set of commercial multithreaded workloads. The results show that stack simulations can accurately model the performance of various cache organizations. The single-pass stack simulation results demonstrate that the effectiveness of various techniques for optimizing the CMP on-chip storage is closely related to the working sets of the workloads as well as to the total cache sizes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "CMP caches; data replication; performance modeling and projection; stack simulation", remark = "{DASCMP'06}", } @Article{Guo:2007:CQC, author = "Fei Guo and Hari Kannan and Li Zhao and Ramesh Illikkal and Ravi Iyer and Don Newell and Yan Solihin and Christos Kozyrakis", title = "From chaos to {QoS}: case studies in {CMP} resource management", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "21--30", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241608", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As more and more cores are enabled on the die of future CMP platforms, we expect that several diverse workloads will run simultaneously on the platform. A key example of this trend is the growth of virtualization usage models. When multiple virtual machines or applications or threads run simultaneously, the quality of service (QoS) that the platform provides to each individual thread is non-deterministic today. This occurs because the simultaneously running threads place very different demands on the shared resources (cache space, memory bandwidth, etc) in the platform and in most cases contend with each other. In this paper, we first present case studies that show how this results in non-deterministic performance. Unlike the compute resources managed through scheduling, platform resource allocation to individual threads cannot be controlled today. In order to provide better determinism and QoS, we then examine resource management mechanisms and present QoS-aware architectures and execution environments. The main contribution of this paper is the architecture feasibility analysis through prototypes that allow experimentation with QoS-Aware execution environments and architectural resources. We describe these QoS prototypes and then present preliminary case studies of multi-tasking and virtualization usage models sharing one critical CMP resource (last-level cache). 
We then demonstrate how proper management of the cache resource can provide service differentiation and deterministic performance behavior when running disparate workloads in future CMP platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "{DASCMP'06}", } @Article{Kondo:2007:IFT, author = "Masaaki Kondo and Hiroshi Sasaki and Hiroshi Nakamura", title = "Improving fairness, throughput and energy-efficiency on a chip multiprocessor through {DVFS}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "31--38", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241609", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recently, a single chip multiprocessor (CMP) is becoming an attractive architecture for improving throughput of program execution. In CMPs, multiple processor cores share several hardware resources such as cache memory and memory bus. Therefore, the resource contention significantly degrades performance of each thread and also loses fairness between threads.\par In this paper, we propose a Dynamic Frequency and Voltage Scaling (DVFS) algorithm for improving total instruction throughput, fairness, and energy efficiency of CMPs. The proposed technique periodically observes the utilization ratio of shared resources and controls the frequency and the voltage of each processor core individually to balance the ratio between threads. We evaluate our technique and the evaluation results show that fairness between threads are greatly improved by the technique. 
Moreover, the total instruction throughput increases in many cases while reducing energy consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "{DASCMP'06}", } @Article{Waliullah:2007:SFC, author = "M. M. Waliullah and Per Stenstrom", title = "Starvation-free commit arbitration policies for transactional memory systems", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "39--46", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241610", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In transactional memory systems like TCC, unordered transactions are committed on a first-come, first-serve basis. If a transaction has read data that has been modified by the next transaction to commit, it will have to roll-back and a lot of computations can potentially be wasted. Even worse, such simple commit arbitration policies are prone to starvation; in fact, the performance of Raytrace in SPLASH-2 suffered significantly because of this.\par This paper analyzes in detail the design issues for commit arbitration policies and proposes novel policies that reduce the amount of wasted computation due to roll-back and, most importantly, avoid starvation. We analyze in detail how to incorporate them in a TCC-like transactional memory protocol. We find that our proposed schemes have no impact on the common-case performance. In addition, they add modest complexity to the baseline protocol.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "{DASCMP'06}", } @Article{Ferri:2007:HSF, author = "Cesare Ferri and Tali Moreshet and R. 
Iris Bahar and Luca Benini and Maurice Herlihy", title = "A hardware\slash software framework for supporting transactional memory in a {MPSoC} environment", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "47--54", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241611", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Manufacturers are focusing on multiprocessor-system-on-a-chip (MPSoC) architectures in order to provide increased concurrency, rather than increased clock speed, for both large-scale as well as embedded systems. Traditionally lock-based synchronization is provided to support concurrency; however, managing locks can be very difficult and error prone. In addition, the performance and power cost of lock-based synchronization can be high. Transactional memories have been extensively investigated as an alternative to lock-based synchronization in general-purpose systems. It has been shown that transactional memory has advantages over locks in terms of ease of programming, performance and energy consumption. However, their applicability to embedded multi-core platforms has not been explored yet. In this paper, we demonstrate a complete hardware transactional memory solution for an embedded multi-core architecture, consisting of a cache-coherent ARM-based cluster, similar to ARM's MPCore. 
Using cycle accurate power and performance models for the transactional memory hardware, we evaluate our architectural framework over a set of different system and application settings, and show that transactional memory is a promising solution, even for resource-constrained embedded multiprocessors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "{DASCMP'06}", } @Article{Rul:2007:FLP, author = "Sean Rul and Hans Vandierendonck and Koen {De Bosschere}", title = "Function level parallelism driven by data dependencies", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "55--62", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241612", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the rise of Chip multiprocessors (CMPs), the amount of parallel computing power will increase significantly in the near future. However, most programs are sequential in nature and have not been explicitly parallelized, so they cannot exploit these parallel resources. Automatic parallelization of sequential, non-regular codes is very hard, as illustrated by the lack of solutions after more than 30 years of research on the topic. The question remains if there is parallelism in sequential programs that can be detected automatically and if so, how much parallelism there is.\par In this paper, we propose a framework for extracting potential parallelism from programs. Applying this framework to sequential programs can teach us how much parallelism is present in a program, but also tells us what the most appropriate parallel construct for a program is, e.g. a pipeline, master/slave work distribution, etc.\par Our framework is profile-based, implying that it is not safe. 
It builds two new graph representations of the profile-data: the interprocedural data flow graph and the data sharing graph. This graphs show the data-flow between functions and the data structures facilitating this data-flow, respectively.\par We apply our framework on the SPECcpu2000 bzip2 benchmark, achieving a speedup of 3.74 of the compression part and a global speedup of 2.45 on a quad processor system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "{DASCMP'06}", } @Article{Henning:2007:GEI, author = "John L. Henning", title = "{Guest editor}'s introduction", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "63--64", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241614", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "During the development of the new benchmark suite CPU2006, SPEC analyzed benchmark candidates for various technical attributes, including time profiles, language standard compliance, I/O activity, system resource usage, and many other attributes. Many people contributed to the analysis, as shown in the credits at www.spec.org/cpu2006/docs/credits.html. This issue of Computer Architecture News presents a set of articles flowing from that analysis effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Henning:2007:SCS, author = "John L. 
Henning", title = "{SPEC CPU} suite growth: an historical perspective", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "65--68", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241615", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Since 1989, the SPEC CPU benchmarks have aspired to ambitious goals: fair, portable, comparable tests using the compute-intensive portion of real applications. It may be difficult today to remember just how much of a challenge these goals presented when SPEC was first founded, or how much of a break they were from previous industry practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Phansalkar:2007:SSC, author = "Aashish Phansalkar and Ajay Joshi and Lizy K. John", title = "Subsetting the {SPEC CPU2006} benchmark suite", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "69--76", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241616", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "On August 24, 2006, the Standard Performance Evaluation Corporation (SPEC) announced CPU2006 -- the next generation of industry-standardized CPU-intensive benchmark suite. The SPEC CPU benchmark suite has become the most frequently used suite for simulation-based computer architecture research. Detailed processor simulators take days to weeks to simulate each of the SPEC CPU programs. In order to reduce simulation to a tractable time, architects and researchers often use only a subset of benchmarks from the SPEC CPU suite to evaluate the potential of their ideas. 
Prior research has demonstrated that statistical techniques are most effective to find a representative subset of benchmark programs from a benchmark suite. The objective of this paper is to apply multivariate statistical data analysis techniques for selecting a representative subset of programs from the SPEC CPU2006 benchmark suite. We measure a set of performance counter based characteristics for the SPEC CPU2006 programs across a large number of architectures and apply multivariate statistical analysis techniques to find a representative subset of benchmarks and representative input sets wherever multiple input sets are provided. The results from this paper will help architects and researchers to find a smaller but representative set of programs from the SPEC CPU2006 benchmark suite, when time or resource constraints prohibit experimentation with the entire benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wong:2007:CBS, author = "Michael Wong", title = "{C++} benchmarks in {SPEC CPU2006}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "77--83", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241617", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In SPEC CPU2006, there are three C++ integer benchmarks and four floating-point C++ benchmarks. This paper describes the work of incorporating C++ benchmarks into SPEC CPU2006. It describes the base language standard supported and the basis for run rules adopted to maintain an even playing field for different compilers. It also describes issues that complicate porting C++ benchmarks. 
It describes some of the C++ Standard compliance issues that were technically interesting during the benchmark development phase, using as examples the behavior of const-correctness, nested class access of private member of enclosing class, and unneeded template instantiations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Henning:2007:SCM, author = "John L. Henning", title = "{SPEC CPU2006} memory footprint", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "84--89", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241618", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The nominal goal for memory consumption by SPEC CPU2006 benchmarks is up to about 900 MB when compiled with 32-bit pointers. The 900 MB maximum was chosen so that a system with 1GB will have about 100MB available for the operating system and overhead processes. By comparison, the goal for SPEC CPU2000 was 200MB [1].", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gove:2007:CWS, author = "Darryl Gove", title = "{CPU2006} working set size", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "90--96", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241619", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "SPEC CPU2000 had a target memory footprint of 200 MB for the benchmarks [1], to enable the suite to run on machines with 256 MB of memory. 
Six years have elapsed since the release of that suite, and in that time memory sizes have increased significantly, so the memory requirements for the recently released CPU2006 reflect this. CPU2006 has been targeted to have a benchmark memory footprint of about 900MB, allowing the suite to run on machines with 1GB of memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Korn:2007:SCS, author = "Wendy Korn and Moon S. Chang", title = "{SPEC CPU2006} sensitivity to memory page sizes", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "97--101", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241620", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "SPEC CPU2006 is a compute-intensive industry standard benchmark suite published in August 2006. This paper characterizes the memory access behavior of SPEC CPU2006 running on IBM POWER5+ microprocessors. We measure the maximum and average memory usage of the benchmarks to validate SPEC's memory requirement criteria. This paper also analyzes how different page sizes affect the performance of the benchmarks. The experiment reveals that 64 KB and 16 MB pages improve the performance up to 46.9\% and 50.9\%, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "CPI analysis; large page size; memory usage; performance optimization; SPEC CPU2006 benchmarks; workload characterization", } @Article{Weicker:2007:SPR, author = "Reinhold P. Weicker and John L. 
Henning", title = "Subroutine profiling results for the {CPU2006} benchmarks", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "102--111", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241621", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Subroutine profiling is a well-known performance tool. For application or system programmers, it determines 'hot spots' where the program spends most of its time, and where careful rewriting can most help performance. For compiler authors, it can give information about programming style in such hot spots, and can indicate where compiler improvements may be useful. For hardware designers and analysts, it can be the starting point to explain performance behavior.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ye:2007:CFA, author = "Dong Ye and Joydeep Ray and David Kaeli", title = "Characterization of file {I/O} activity for {SPEC CPU2006}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "112--117", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241622", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "SPEC CPU2006 is a compute-intensive benchmark suite designed to stress a computer system's processor, memory subsystem, and compiler. To construct this suite, SPEC has selected benchmarks that are derived from real world applications. 
When run with their reference inputs, these programs place a significant computational burden on today's mainstream desktops as well as high-end workstations and servers.\par For these applications to thoroughly exercise the merits of a particular processor/memory design point, it is necessary to limit the amount of I/O activity generated. Since these applications come from real world applications, the suite developers have considered how best to limit the amount of file-based I/O activity present in these applications. This paper presents the characteristics of file I/O activity in the resulting suite and its overall impact on the performance of these applications. We also report on some of the choices SPEC has made in order to reduce the file I/O activity in some specific programs of the suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Henning:2007:PCD, author = "John L. Henning", title = "Performance counters and development of {SPEC CPU2006}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "118--121", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241623", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Performance counters provide the means to track detailed events that occur on a CPU chip. These events are of interest to both performance analysts and compiler developers. Counting them provides essential clues to guide performance improvement. For example, a tester who sees that a program has a high cache miss rate on a particular system may experiment with compilation options that improve prefetching. 
A compiler developer who sees the same thing may realize that the code generator's machine model is missing some crucial detail of behavior on that particular system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gove:2007:ECB, author = "Darryl Gove and Lawrence Spracklen", title = "Evaluating the correspondence between training and reference workloads in {SPEC CPU2006}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "122--129", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241624", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Profile feedback (sometimes called Feedback Directed Optimisation FDO) is a useful technique for providing the compiler with additional information about runtime program flow. The compiler is able to use this information to make optimisation decisions that improve the way the code is laid out in memory or determine which routines are inlined, and hence improve the performance of the application.\par The use of profile feedback requires the code to be compiled twice. The first time the compiler generates an instrumented version of the application. This instrumented version is then run on one or more 'representative' training workloads to gather profile data. This profile data contains information such as how many times each routine is executed and how frequently each branch is taken. The second pass through the compiler uses this information to make more enlightened optimisation decisions.\par The quality of the training data impacts the ability of the compiler to do the best job that it can. 
This paper discusses a method of assessing the similarity of the training workload to the reference workload, and applies this methodology to evaluate the training workloads in the SPEC CPU2006 benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Spradling:2007:SCB, author = "Cloyce D. Spradling", title = "{SPEC CPU2006} benchmark tools", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "130--134", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241625", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The benchmarks that make up the SPEC CPU2006 benchmark suite are set-up, run, timed, and scored by the CPU tools harness. The tools have evolved over time from a collection of edit-it-yourself makefiles, scripts, and an Excel spreadsheet to the current Perl-based suite. The basic purpose of the tools is to make life easier for the benchmarker; they make it easier to tweak compilation settings, easier to keep track of those settings, and most importantly, they make it easier to follow the run and reporting rules.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sridhar:2007:HLO, author = "Swaroop Sridhar and Jonathan S. Shapiro and Prashanth P. 
Bungale", title = "{HDTrans}: a low-overhead dynamic translator", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "135--140", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241602", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Dynamic translation is a general purpose tool used for instrumenting programs at run time. Many current translators perform substantial rewriting during translation in an attempt to reduce execution time. When dynamic translation is used as a ubiquitous policy enforcement mechanism, the majority of program executions have no dominating inner loop that can be used to amortize the cost of translation. Even under more favorable usage assumptions, our measurements show that such optimizations offer no significant benefit in most cases. A simpler, more maintainable, adaptable, and smaller translator may be preferable to more complicated designs.\par In this paper, we present HDTrans, a light-weight IA-32 to IA-32 binary translation system that uses some simple and effective translation techniques in combination with established trace linearization and code caching optimizations. We also present an evaluation of translation overhead under non-ideal conditions, showing that conventional benchmarks do not provide a good prediction of translation overhead when used pervasively.\par A further contribution of this paper is an analysis of the effectiveness of post-compile static pre-translation techniques for overhead reduction. 
Our results indicate that static pre-translation is effective only when expensive instrumentation or optimization is performed, and that efficient reload of pre-translated code incurs a substantial execution-time penalty.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yan:2007:HMC, author = "Jun Yan and Wei Zhang", title = "Hybrid multi-core architecture for boosting single-threaded performance", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "141--148", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241603", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The scaling of technology and the diminishing return of complicated uniprocessors have driven the industry towards multicore processors. While multithreaded applications can naturally leverage the enhanced throughput of multi-core processors, a large number of important applications are single-threaded, which cannot automatically harness the potential of multi-core processors. In this paper, we propose a compiler-driven heterogeneous multicore architecture, consisting of tightly-integrated VLIW (Very Long Instruction Word) and superscalar processors on a single chip, to automatically boost the performance of single-threaded applications without compromising the capability to support multithreaded programs. In the proposed multi-core architecture, while the high-performance VLIW core is used to run code segments with high instruction-level parallelism (ILP) extracted by the compiler; the superscalar core can be exploited to deal with the runtime events that are typically difficult for the VLIW core to handle, such as L2 cache misses. 
Our initial experimental results by running the preexecution thread on the superscalar core to mitigate the L2 cache misses of the main thread on the VLIW core indicate that the proposed VLIW/superscalar multi-core processor can automatically improve the performance of single-threaded general-purpose applications by up to 40.8\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2007:INa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "149--154", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241627", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shaw:2007:ASP, author = "David E. Shaw and Martin M. Deneroff and Ron O. Dror and Jeffrey S. Kuskin and Richard H. Larson and John K. Salmon and Cliff Young and Brannon Batson and Kevin J. Bowers and Jack C. Chao and Michael P. Eastwood and Joseph Gagliardo and J. P. Grossman and C. Richard Ho and Douglas J. Ierardi and Istv{\'a}n Kolossv{\'a}ry and John L. Klepeis and Timothy Layman and Christine McLeavey and Mark A. Moraes and Rolf Mueller and Edward C. Priest and Yibing Shan and Jochen Spengler and Michael Theobald and Brian Towles and Stanley C. 
Wang", title = "{Anton}, a special-purpose machine for molecular dynamics simulation", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "1--12", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250664", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The ability to perform long, accurate molecular dynamics (MD) simulations involving proteins and other biological macro-molecules could in principle provide answers to some of the most important currently outstanding questions in the fields of biology, chemistry and medicine. A wide range of biologically interesting phenomena, however, occur over time scales on the order of a millisecond---about three orders of magnitude beyond the duration of the longest current MD simulations.\par In this paper, we describe a massively parallel machine called Anton, which should be capable of executing millisecond-scale classical MD simulations of such biomolecular systems. The machine, which is scheduled for completion by the end of 2008, is based on 512 identical MD-specific ASICs that interact in a tightly coupled manner using a specialized high-speed communication network. Anton has been designed to use both novel parallel algorithms and special-purpose logic to dramatically accelerate those calculations that dominate the time required for a typical MD simulation.
The remainder of the simulation algorithm is executed by a programmable portion of each chip that achieves a substantial degree of parallelism while preserving the flexibility necessary to accommodate anticipated advances in physical models and simulation methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bioinformatics; biomolecular system simulation; computational biology; computational drug design; molecular dynamics; protein folding; protein structure; special-purpose machine", } @Article{Fan:2007:PPW, author = "Xiaobo Fan and Wolf-Dietrich Weber and Luiz Andre Barroso", title = "Power provisioning for a warehouse-sized computer", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "13--23", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250665", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Large-scale Internet services require a computing infrastructure that can be appropriately described as a warehouse-sized computing system. The cost of building datacenter facilities capable of delivering a given power capacity to such a computer can rival the recurring energy consumption costs themselves. Therefore, there are strong economic incentives to operate facilities as close as possible to maximum capacity, so that the non-recurring facility costs can be best amortized. That is difficult to achieve in practice because of uncertainties in equipment power ratings and because power consumption tends to vary significantly with the actual computing activity. 
Effective power provisioning strategies are needed to determine how much computing equipment can be safely and efficiently hosted within a given power budget.\par In this paper we present the aggregate power usage characteristics of large collections of servers (up to 15 thousand) for different classes of applications over a period of approximately six months. Those observations allow us to evaluate opportunities for maximizing the use of the deployed power capacity of datacenters, and assess the risks of over-subscribing it. We find that even in well-tuned applications there is a noticeable gap (7--16\%) between achieved and theoretical aggregate peak power usage at the cluster level (thousands of servers). The gap grows to almost 40\% in whole datacenters. This headroom can be used to deploy additional compute equipment within the same power budget with minimal risk of exceeding it. We use our modeling framework to estimate the potential of power management schemes to reduce peak power and energy usage. We find that the opportunities for power and energy savings are significant, but greater at the cluster-level (thousands of servers) than at the rack-level (tens). Finally we argue that systems need to be power efficient across the activity range, and not only at peak performance levels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "energy efficiency; power modeling; power provisioning", } @Article{Blundell:2007:MFC, author = "Colin Blundell and Joe Devietti and E. Christopher Lewis and Milo M. K.
Martin", title = "Making the fast case common and the uncommon case simple in unbounded transactional memory", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "24--34", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250667", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware transactional memory has great potential to simplify the creation of correct and efficient multithreaded programs, allowing programmers to exploit more effectively the soon-to-be-ubiquitous multi-core designs. Several recent proposals have extended the original bounded transactional memory to unbounded transactional memory, a crucial step toward transactions becoming a general-purpose primitive. Unfortunately, supporting the concurrent execution of an unbounded number of unbounded transactions is challenging, and as a result, many proposed implementations are complex.\par This paper explores a different approach. First, we introduce the permissions-only cache to extend the bound at which transactions overflow to allow the fast, bounded case to be used as frequently as possible. Second, we propose OneTM to simplify the implementation of unbounded transactional memory by bounding the concurrency of transactions that overflow the cache. These mechanisms work synergistically to provide a simple and fast unbounded transactional memory system.\par The permissions-only cache efficiently maintains the coherence permissions---but not data---for blocks read or written transactionally that have been evicted from the processor's caches. By holding coherence permissions for these blocks, the regular cache coherence protocol can be used to detect transactional conflicts using only a few bits of on-chip storage per overflowed cache block.
OneTM allows only one overflowed transaction at a time, relying on the permissions-only cache to ensure that overflow is infrequent. We present two implementations. In OneTM-Serialized, an overflowed transaction simply stalls all other threads in the application.\par In OneTM-Concurrent, non-overflowed transactions and non-transactional code can execute concurrently with the overflowed transaction, providing more concurrency while retaining OneTM's core simplifying assumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "concurrency; parallel programming; transactional memory; transactions", } @Article{Zhu:2007:SSB, author = "Weirong Zhu and Vugranam C. Sreedhar and Ziang Hu and Guang R. Gao", title = "Synchronization state buffer: supporting efficient fine-grain synchronization on many-core architectures", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "35--45", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250668", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Efficient fine-grain synchronization is extremely important to effectively harness the computational power of many-core architectures. However, designing and implementing fine-grain synchronization in such architectures presents several challenges, including issues of synchronization induced overhead, storage cost, scalability, and the level of granularity to which synchronization is applicable. This paper proposes the Synchronization State Buffer (SSB), a scalable architectural design for fine-grain synchronization that efficiently performs synchronizations between concurrent threads.
The design of SSB is motivated by the following observation: at any instance during the parallel execution only a small fraction of memory locations are actively participating in synchronization. Based on this observation we present a fine-grain synchronization design that records and manages the states of frequently synchronized data using modest hardware support. We have implemented the SSB design in the context of the 160-core IBM Cyclops-64 architecture. Using detailed simulation, we present our experience for a set of benchmarks with different workload characteristics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "fine-grain synchronization; many-core; SSB", } @Article{Marty:2007:VHS, author = "Michael R. Marty and Mark D. Hill", title = "Virtual hierarchies to support server consolidation", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "46--56", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250670", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Server consolidation is becoming an increasingly popular technique to manage and utilize systems. This paper develops CMP memory systems for server consolidation where most sharing occurs within Virtual Machines (VMs). Our memory systems maximize shared memory accesses serviced within a VM, minimize interference among separate VMs, facilitate dynamic reassignment of VMs to processors and memory, and support content-based page sharing among VMs. We begin with a tiled architecture where each of 64 tiles contains a processor, private L1 caches, and an L2 bank. First, we reveal why single-level directory designs fail to meet workload consolidation goals. 
Second, we develop the paper's central idea of imposing a two-level virtual (or logical) coherence hierarchy on a physically flat CMP that harmonizes with VM assignment. Third, we show that the best of our two virtual hierarchy (VH) variants performs 12-58\% better than the best alternative flat directory protocol when consolidating Apache, OLTP, and Zeus commercial workloads on our simulated 64-core CMP.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache coherence; chip multiprocessors (CMPs); memory hierarchies; multicore; partitioning; server consolidation; virtual machines", } @Article{Nesbit:2007:VPC, author = "Kyle J. Nesbit and James Laudon and James E. Smith", title = "Virtual private caches", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "57--68", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250671", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Virtual Private Machines (VPM) provide a framework for Quality of Service (QoS) in CMP-based computer systems. VPMs incorporate microarchitecture mechanisms that allow shares of hardware resources to be allocated to executing threads, thus providing applications with an upper bound on execution time regardless of other thread activity. Virtual Private Caches (VPCs) are an important element of VPMs. VPC hardware consists of two major components: the VPC Arbiter, which manages shared cache bandwidth, and the VPC Capacity Manager, which manages the cache storage. Both the VPC Arbiter and VPC Capacity Manager provide minimum service guarantees that, when combined, achieve QoS for the cache subsystem.
Simulation-based evaluation shows that conventional cache bandwidth management policies allow concurrently executing threads to affect each other significantly in an uncontrollable manner. The evaluation targets cache bandwidth because the effects of cache capacity sharing have been studied elsewhere. In contrast with the conventional policies, the VPC Arbiter meets its QoS performance objectives on all workloads studied and over a range of allocated bandwidth levels. The VPC Arbiter's fairness policy, which distributes leftover bandwidth, mitigates the effects of cache preemption latencies, thus ensuring threads a high-degree of performance isolation. Furthermore, the VPC Arbiter eliminates negative bandwidth interference which can improve aggregate throughput and resource utilization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessor; performance isolation; quality of service; shared caches; soft real-time", } @Article{Minh:2007:EHT, author = "Chi Cao Minh and Martin Trautmann and JaeWoong Chung and Austen McDonald and Nathan Bronson and Jared Casper and Christos Kozyrakis and Kunle Olukotun", title = "An effective hybrid transactional memory system with strong isolation guarantees", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "69--80", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250673", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose signature-accelerated transactional memory (SigTM), a hybrid TM system that reduces the overhead of software transactions. SigTM uses hardware signatures to track the read-set and write-set for pending transactions and perform conflict detection between concurrent threads. 
All other transactional functionality, including data versioning, is implemented in software. Unlike previously proposed hybrid TM systems, SigTM requires no modifications to the hardware caches, which reduces hardware cost and simplifies support for nested transactions and multithreaded processor cores. SigTM is also the first hybrid TM system to provide strong isolation guarantees between transactional blocks and non-transactional accesses without additional read and write barriers in non-transactional code.\par Using a set of parallel programs that make frequent use of coarse-grain transactions, we show that SigTM accelerates software transactions by 30\% to 280\%. For certain workloads, SigTM can match the performance of a full-featured hardware TM system, while for workloads with large read-sets it can be up to two times slower. Overall, we show that SigTM combines the performance characteristics and strong isolation guarantees of hardware TM implementations with the low cost and flexibility of software TM systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "multi-core architectures; parallel programming; strong isolation; transactional memory", } @Article{Bobba:2007:PPH, author = "Jayaram Bobba and Kevin E. Moore and Haris Volos and Luke Yen and Mark D. Hill and Michael M. Swift and David A. 
Wood", title = "Performance pathologies in hardware transactional memory", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "81--91", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250674", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware Transactional Memory (HTM) systems reflect choices from three key design dimensions: conflict detection, version management, and conflict resolution. Previously proposed HTMs represent three points in this design space: lazy conflict detection, lazy version management, committer wins (LL); eager conflict detection, lazy version management, requester wins (EL); and eager conflict detection, eager version management, and requester stalls with conservative deadlock avoidance (EE). To isolate the effects of these high-level design decisions, we develop a common framework that abstracts away differences in cache write policies, interconnects, and ISA to compare these three design points. Not surprisingly, the relative performance of these systems depends on the workload. Under light transactional loads they perform similarly, but under heavy loads they differ by up to 80\%. None of the systems performs best on all of our benchmarks. We identify seven performance pathologies---interactions between workload and system that degrade performance---as the root cause of many performance differences: FriendlyFire, StarvingWriter, SerializedCommit, FutileStall, StarvingElder, RestartConvoy, and DuelingUpgrades. We discuss when and on which systems these pathologies can occur and show that they actually manifest within TM workloads. The insight provided by these pathologies motivated four enhanced systems that often significantly reduce transactional memory overhead.
Importantly, by avoiding transaction pathologies, each enhanced system performs well across our suite of benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "contention management; hardware; pathology; performance; transactional memory", } @Article{Ramadan:2007:MTT, author = "Hany E. Ramadan and Christopher J. Rossbach and Donald E. Porter and Owen S. Hofmann and Aditya Bhandari and Emmett Witchel", title = "{MetaTM\slash TxLinux}: transactional memory for an operating system", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "92--103", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250675", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper quantifies the effect of architectural design decisions on the performance of TxLinux. TxLinux is a Linux kernel modified to use transactions in place of locking primitives in several key subsystems. We run TxLinux on MetaTM, which is a new hardware-transaction memory (HTM) model. MetaTM contains features that enable efficient and correct interrupt handling for an x86-like architecture. Live stack overwrites can corrupt non-transactional stack memory and requires a small change to the transaction register checkpoint hardware to ensure correct operation of the operating system. We also propose stack based early release to reduce spurious conflicts on stack memory between kernel code and interrupt handlers. We use MetaTM to examine the performance sensitivity of individual architectural features. 
For TxLinux we find that Polka and SizeMatters are effective contention management policies, some form of backoff on transaction contention is vital for performance, and stalling on a transaction conflict reduces transaction restart rates, but does not improve performance. Transaction write sets are small, and performance is insensitive to transaction abort costs but sensitive to commit costs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "MetaTM; OS support; transactional memory; TxLinux", } @Article{Shriraman:2007:IHS, author = "Arrvindh Shriraman and Michael F. Spear and Hemayet Hossain and Virendra J. Marathe and Sandhya Dwarkadas and Michael L. Scott", title = "An integrated hardware-software approach to flexible transactional memory", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "104--115", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250676", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "There has been considerable recent interest in both hardware and software transactional memory (TM). We present an intermediate approach, in which hardware serves to accelerate a TM implementation controlled fundamentally by software. Specifically, we describe an alert on update mechanism (AOU) that allows a thread to receive fast, asynchronous notification when previously-identified lines are written by other threads, and a programmable data isolation mechanism (PDI) that allows a thread to hide its speculative writes from other threads, ignoring conflicts, until software decides to make them visible.
These mechanisms reduce bookkeeping, validation, and copying overheads without constraining software policy on a host of design decisions.\par We have used AOU and PDI to implement a hardware-accelerated-software transactional memory system we call RTM. We have also used AOU alone to create a simpler 'RTM-Lite'. Across a range of microbenchmarks, RTM outperforms RSTM, a publicly available software transactional memory system, by as much as 8.7x (geometric mean of 3.5x) in single-thread mode. At 16 threads, it outperforms RSTM by as much as 5x, with an average speedup of 2x. Performance degrades gracefully when transactions overflow hardware structures. RTM-Lite is slightly faster than RTM for transactions that modify only small objects; full RTM is significantly faster when objects are large. In a strong argument for policy flexibility, we find that the choice between eager (first-access) and lazy (commit-time) conflict detection can lead to significant performance differences in both directions, depending on application characteristics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache coherence; multiprocessors; RSTM; transactional memory", } @Article{Abad:2007:RRE, author = "Pablo Abad and Valentin Puente and Jos{\'e} Angel Gregorio and Pablo Prieto", title = "Rotary router: an efficient architecture for {CMP} interconnection networks", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "116--125", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250678", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The trend towards increasing the number of processor cores and cache capacity in future Chip-Multiprocessors (CMPs), will require scalable packet-switched interconnection networks 
adapted to the restrictions imposed by the CMP environment. This paper presents an innovative router design, which successfully addresses CMP cost/performance constraints. The router structure is based on two independent rings, which force packets to circulate either clockwise or anti-clockwise, traveling through every port of the router. It uses a completely decentralized scheduling scheme, which allows the design to: (1) take advantage of wide links, (2) reduce Head of Line blocking, (3) use adaptive routing, (4) be topology agnostic, (5) scale with network degree, and (6) have reasonable power consumption and implementation cost. A thorough comparative performance analysis against competitive conventional routers shows an advantage for our proposal of up to 50 \% in terms of raw performance and nearly 60 \% in terms of energy-delay product.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multi-processors; interconnection networks; router architecture", } @Article{Kim:2007:FBC, author = "John Kim and William J. Dally and Dennis Abts", title = "Flattened butterfly: a cost-efficient topology for high-radix networks", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "126--137", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250679", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Increasing integrated-circuit pin bandwidth has motivated a corresponding increase in the degree or radix of interconnection networks and their routers. This paper introduces the flattened butterfly, a cost-efficient topology for high-radix networks. 
On benign (load-balanced) traffic, the flattened butterfly approaches the cost/performance of a butterfly network and has roughly half the cost of a comparable performance Clos network. The advantage over the Clos is achieved by eliminating redundant hops when they are not needed for load balance. On adversarial traffic, the flattened butterfly matches the cost/performance of a folded-Clos network and provides an order of magnitude better performance than a conventional butterfly. In this case, global adaptive routing is used to switch the flattened butterfly from minimal to non-minimal routing --- using redundant hops only when they are needed. Minimal and non-minimal, oblivious and adaptive routing algorithms are evaluated on the flattened butterfly. We show that load-balancing adversarial traffic requires nonminimal globally-adaptive routing and show that sequential allocators are required to avoid transient load imbalance when using adaptive routing algorithms. We also compare the cost of the flattened butterfly to folded-Clos, hypercube, and butterfly networks with identical capacity and show that the flattened butterfly is more cost-efficient than folded-Clos and hypercube topologies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cost model; flattened butterfly; global adaptive routing; high-radix routers; interconnection networks; topology", } @Article{Kim:2007:NDD, author = "Jongman Kim and Chrysostomos Nicopoulos and Dongkook Park and Reetuparna Das and Yuan Xie and Vijaykrishnan Narayanan and Mazin S. Yousif and Chita R.
Das", title = "A novel dimensionally-decomposed router for on-chip communication in {$3$D} architectures", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "138--149", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250680", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Much like multi-story buildings in densely packed metropolises, three-dimensional (3D) chip structures are envisioned as a viable solution to skyrocketing transistor densities and burgeoning die sizes in multi-core architectures. Partitioning a larger die into smaller segments and then stacking them in a 3D fashion can significantly reduce latency and energy consumption. Such benefits emanate from the notion that inter-wafer distances are negligible compared to intra-wafer distances. This attribute substantially reduces global wiring length in 3D chips. The work in this paper integrates the increasingly popular idea of packet-based Networks-on-Chip (NoC) into a 3D setting. While NoCs have been studied extensively in the 2D realm, the microarchitectural ramifications of moving into the third dimension have yet to be fully explored. This paper presents a detailed exploration of inter-strata communication architectures in 3D NoCs. Three design options are investigated; a simple bus-based inter-wafer connection, a hop-by-hop standard 3D design, and a full 3D crossbar implementation. In this context, we propose a novel partially-connected 3D crossbar structure, called the 3D Dimensionally-Decomposed (DimDe) Router, which provides a good tradeoff between circuit complexity and performance benefits. 
Simulation results using (a) a stand-alone cycle-accurate 3D NoC simulator running synthetic workloads, and (b) a hybrid 3D NoC/cache simulation environment running real commercial and scientific benchmarks, indicate that the proposed DimDe design provides latency and throughput improvements of over 20\% on average over the other 3D architectures, while remaining within 5\% of the full 3D crossbar performance. Furthermore, based on synthesized hardware implementations in 90 nm technology, the DimDe architecture outperforms all other designs -- including the full 3D crossbar -- by an average of 26\% in terms of the Energy-Delay Product (EDP).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "3D architecture; 3D integration; network-on-chip (NoC)", } @Article{Kumar:2007:EVC, author = "Amit Kumar and Li-Shiuan Peh and Partha Kundu and Niraj K. Jha", title = "Express virtual channels: towards the ideal interconnection fabric", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "150--161", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250681", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Due to wire delay scalability and bandwidth limitations inherent in shared buses and dedicated links, packet-switched on-chip interconnection networks are fast emerging as the pervasive communication fabric to connect different processing elements in many-core chips. 
However, current state-of-the-art packet-switched networks rely on complex routers which increases the communication overhead and energy consumption as compared to the ideal interconnection fabric.\par In this paper, we try to close the gap between the state-of-the-art packet-switched network and the ideal interconnect by proposing express virtual channels (EVCs), a novel flow control mechanism which allows packets to virtually bypass intermediate routers along their path in a completely non-speculative fashion, thereby lowering the energy/delay towards that of a dedicated wire while simultaneously approaching ideal throughput with a practical design suitable for on-chip networks.\par Our evaluation results using a detailed cycle-accurate simulator on a range of synthetic traffic and SPLASH benchmark traces show up to 84\% reduction in packet latency and up to 23\% improvement in throughput while reducing the average router energy consumption by up to 38\% over an existing state-of-the-art packet-switched design. When compared to the ideal interconnect, EVCs add just two cycles to the no-load latency, and are within 14\% of the ideal throughput. Moreover, we show that the proposed design incurs a minimal hardware overhead while exhibiting excellent scalability with increasing network sizes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "flow-control; packet-switching; router design", } @Article{Kumar:2007:CAS, author = "Sanjeev Kumar and Christopher J.
Hughes and Anthony Nguyen", title = "{Carbon}: architectural support for fine-grained parallelism on chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "162--173", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250683", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Chip multiprocessors (CMPs) are now commonplace, and the number of cores on a CMP is likely to grow steadily. However, in order to harness the additional compute resources of a CMP, applications must expose their thread-level parallelism to the hardware. One common approach to doing this is to decompose a program into parallel 'tasks' and allow an underlying software layer to schedule these tasks to different threads. Software task scheduling can provide good parallel performance as long as tasks are large compared to the software overheads.\par We examine a set of applications from an important emerging domain: Recognition, Mining, and Synthesis (RMS). Many RMS applications are compute-intensive and have abundant thread-level parallelism, and are therefore good targets for running on a CMP. However, a significant number have small tasks for which software task schedulers achieve only limited parallel speedups.\par We propose Carbon, a hardware technique to accelerate dynamic task scheduling on scalable CMPs. Carbon has relatively simple hardware, most of which can be placed far from the cores. We compare Carbon to some highly tuned software task schedulers for a set of RMS benchmarks with small tasks. 
Carbon delivers significant performance improvements over the best software scheduler: on average for 64 cores, 68\% faster on a set of loop-parallel benchmarks, and 109\% faster on a set of task-parallel benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "architectural support; CMP; loop and task parallelism", } @Article{Neelakantam:2007:HAR, author = "Naveen Neelakantam and Ravi Rajwar and Suresh Srinivas and Uma Srinivasan and Craig Zilles", title = "Hardware atomicity for reliable software speculation", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "174--185", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250684", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Speculative compiler optimizations are effective in improving both single-thread performance and reducing power consumption, but their implementation introduces significant complexity, which can limit their adoption, limit their optimization scope, and negatively impact the reliability of the compilers that implement them. To eliminate much of this complexity, as well as increase the effectiveness of these optimizations, we propose that microprocessors provide architecturally-visible hardware primitives for atomic execution. These primitives provide to the compiler the ability to optimize the program's hot path in isolation, allowing the use of non-speculative formulations of optimization passes to perform speculative optimizations. 
Atomic execution guarantees that if a speculation invariant does not hold, the speculative updates are discarded, the register state is restored, and control is transferred to a non-speculative version of the code, thereby relieving the compiler from the responsibility of generating compensation code.\par We demonstrate the benefit of hardware atomicity in the context of a Java virtual machine. We find incorporating the notion of atomic regions into an existing compiler intermediate representation to be natural, requiring roughly 3,000 lines of code ($\sim$3\% of a JVM's optimizing compiler), most of which were for region formation. Its incorporation creates new opportunities for existing optimization passes, as well as greatly simplifying the implementation of additional optimizations (e.g., partial inlining, partial loop unrolling, and speculative lock elision). These optimizations reduce dynamic instruction count by 11\% on average and result in a 10-15\% average speedup, relative to a baseline compiler with a similar degree of inlining.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "atomicity; checkpoint; isolation; Java; optimization; speculation", } @Article{Ipek:2007:CFA, author = "Engin Ipek and Meyrem Kirman and Nevin Kirman and Jose F.
Martinez", title = "Core fusion: accommodating software diversity in chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "186--197", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250686", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents core fusion, a reconfigurable chip multiprocessor (CMP) architecture where groups of fundamentally independent cores can dynamically morph into a larger CPU, or they can be used as distinct processing elements, as needed at run time by applications. Core fusion gracefully accommodates software diversity and incremental parallelization in CMPs. It provides a single execution model across all configurations, requires no additional programming effort or specialized compiler support, maintains ISA compatibility, and leverages mature micro-architecture technology.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; reconfigurable architectures; software diversity", } @Article{Chi:2007:TQA, author = "Eric Chi and Stephen A. Lyon and Margaret Martonosi", title = "Tailoring quantum architectures to implementation style: a quantum computer for mobile and persistent qubits", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "198--209", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250687", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In recent years, quantum computing (QC) research has moved from the realm of theoretical physics and mathematics into real implementations.
With many different potential hardware implementations, quantum computer architecture is a rich field with an opportunity to solve interesting new problems and to revisit old ones. This paper presents a QC architecture tailored to physical implementations with highly mobile and persistent quantum bits (qubits). Implementations with qubit coherency times that are much longer than operation times and qubit transportation times that are orders of magnitude faster than operation times lend greater flexibility to the architecture. This is particularly true in the placement and locality of individual qubits. For concreteness, we assume a physical device model based on electron-spin qubits on liquid helium (eSHe).\par Like many conventional computer architectures, QCs focus on the efficient exposure of parallelism. We present here a QC microarchitecture that enjoys increasing computational parallelism with size and latency scaling only linearly with the number of operations. Although an efficient and high level of parallelism is admirable, quantum hardware is still expensive and difficult to build, so we demonstrate how the software may be optimized to reduce an application's hardware requirements by 25\% with no performance loss. Because the majority of a QC's time and resources are devoted to quantum error correction, we also present noise modeling results that evaluate error correction procedures. 
These results demonstrate that idle qubits in memory need only be refreshed approximately once every one hundred operation cycles.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "architecture; quantum", } @Article{Yang:2007:BSP, author = "Xuejun Yang and Xiaobo Yan and Zuocheng Xing and Yu Deng and Jiang Jiang and Ying Zhang", title = "A 64-bit stream processor architecture for scientific applications", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "210--219", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250689", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Stream architecture is a novel microprocessor architecture with wide application potential. But as for whether it can be used efficiently in scientific computing, many issues await further study. This paper first gives the design and implementation of a 64-bit stream processor, FT64 (Fei Teng 64), for scientific computing. The carrying out of 64-bit extension design and scientific computing oriented optimization are described in such aspects as instruction set architecture, stream controller, micro controller, ALU cluster, memory hierarchy and interconnection interface here. Second, two kinds of communications as message passing and stream communications are put forward. An interconnection based on the communications is designed for FT64-based high performance computers. Third, a novel stream programming language, SF95 (Stream FORTRAN95), and its compiler, SF95Compiler (Stream FORTRAN95 Compiler), are developed to facilitate the development of scientific applications. 
Finally, nine typical scientific application kernels are tested and the results show the efficiency of stream architecture for scientific computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "architecture; compiler; high performance computing; program language; scientific application; stream processor", } @Article{Hughes:2007:PSA, author = "Christopher J. Hughes and Radek Grzeszczuk and Eftychios Sifakis and Daehyun Kim and Sanjeev Kumar and Andrew P. Selle and Jatin Chhugani and Matthew Holliman and Yen-Kuang Chen", title = "Physical simulation for animation and visual effects: parallelization and characterization for chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "220--231", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250690", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We explore the emerging application area of physics-based simulation for computer animation and visual special effects. In particular, we examine its parallelization potential and characterize its behavior on a chip multiprocessor (CMP). Applications in this domain model and simulate natural phenomena, and often direct visual components of motion pictures. We study a set of three workloads that exemplify the span and complexity of physical simulation applications used in a production environment: fluid dynamics, facial animation, and cloth simulation. 
They are computationally demanding, requiring from a few seconds to several minutes to simulate a single frame; therefore, they can benefit greatly from the acceleration possible with large scale CMPs.\par Starting with serial versions of these applications, we parallelize code accounting for at least 96\% of the serial execution time, targeting a large number of threads. We then study the most expensive modules using a simulated 64-core CMP.\par For the code representing key modules, we achieve parallel scaling of 45x, 50x, and 30x for fluid, face, and cloth simulations, respectively. The modules have a spectrum of parallel task granularity and locking behavior, and all but one are dominated by loop-level parallelism. Many modules operate on streams of data. In some cases, modules iterate over their data, leading to significant temporal locality. This streaming behavior leads to very high on-die and main memory bandwidth requirements. Finally, most modules have little inter-thread communication since they are data-parallel, but a few require heavy communication between data-parallel operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "characterization; CMP; parallelization; physical simulation", } @Article{Yeh:2007:PAR, author = "Thomas Y. Yeh and Petros Faloutsos and Sanjay J. 
Patel and Glenn Reinman", title = "{ParallAX}: an architecture for real-time physics", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "232--243", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250691", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Future interactive entertainment applications will feature the physical simulation of thousands of interacting objects using explosions, breakable objects, and cloth effects. While these applications require a tremendous amount of performance to satisfy the minimum frame rate of 30 FPS, there is a dramatic amount of parallelism in future physics workloads. How will future physics architectures leverage parallelism to achieve the real-time constraint?\par We propose and characterize a set of forward-looking benchmarks to represent future physics load and explore the design space of future physics processors. In response to the demand of this workload, we demonstrate an architecture with a set of powerful cores and caches to provide performance for the serial and coarse-grain parallel components of physics simulation, along with a flexible set of simple cores to exploit fine-grain parallelism. Our architecture combines intelligent, application-aware L2 management with dynamic coupling\slash allocation of simple cores to complex cores.
Furthermore, we perform sensitivity analysis on interconnect alternatives to determine how tightly to couple these cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "application specific processor; chip multiprocessor; interactive entertainment; physics based animation; real-time physics; stream processing", } @Article{Kim:2007:AIB, author = "Martha Mercaldi Kim and Mojtaba Mehrara and Mark Oskin and Todd Austin", title = "Architectural implications of brick and mortar silicon manufacturing", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "244--253", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250693", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We introduce a novel chip fabrication technique called 'brick and mortar', in which chips are made from small, pre-fabricated ASIC bricks and bonded in a designer-specified arrangement to an inter-brick communication backbone chip. The goal of brick and mortar assembly is to provide a low-overhead method to produce custom chips, yet with performance that tracks an ASIC more closely than an FPGA. This paper examines the architectural design choices in this chip-design system. These choices include the definition of reasonable bricks, both in functionality and size, as well as the communication interconnect that the I/O cap provides. To do this we synthesize candidate bricks, analyze their area and bandwidth demands, and present an architectural design for the inter-brick communication network. We discuss a sample chip design, a 16-way CMP, and analyze the costs and benefits of designing chips with brick and mortar. 
We find that this method of producing chips incurs only a small performance loss (8\%) compared to a fully custom ASIC, which is significantly less than the degradation seen from other low-overhead chip options, such as FPGAs. Finally, we measure the effect that architectural design decisions have on the behavior of the proposed physical brick assembly technique, fluidic self-assembly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip assembly; design re-use; interconnect design", } @Article{Amin:2007:APA, author = "Ahmed M. Amin and Mithuna Thottethodi and T. N. Vijaykumar and Steven Wereley and Stephen C. Jacobson", title = "{Aquacore}: a programmable architecture for microfluidics", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "254--265", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250694", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Advances in microfluidic research has enabled lab-on-a-chip (LoC) technology to achieve miniaturization and integration of biological and chemical analyses to a single chip comprising channels, valves, mixers, heaters, separators, and sensors. These miniature instruments appear to offer the rare combination of faster, cheaper, and higher-precision analyses in comparison to conventional bench-scale methods. LoCs have been applied to diverse domains such as proteomics, genomics, biochemistry, virology, cell biology, and chemical synthesis. However, to date LoCs have been designed as application-specific chips which incurs significant design effort, turn-around time, and cost, and degrades designer and user productivity. 
To address these limitations, we envision a programmable LoC (PLoC) and propose a comprehensive fluidic instruction set, called AquaCore Instruction Set (AIS), and a fluidic microarchitecture, called AquaCore, to implement AIS. We present four key design aspects in which the AIS and AquaCore differ from their computer counterparts, and our design decisions made on the basis of the implications of these differences. We demonstrate the use of the PLoC in a range of domains by hand-compiling real-world microfluidic assays in AIS, and show a detailed breakdown of the execution times for the assays and an estimate of the chip area.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "fluidic; fluidic microarchitecture; instruction set; microfluidics; programmable lab on a chip", } @Article{Wenisch:2007:MSW, author = "Thomas F. Wenisch and Anastasia Ailamaki and Babak Falsafi and Andreas Moshovos", title = "Mechanisms for store-wait-free multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "266--277", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250696", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Store misses cause significant delays in shared-memory multiprocessors because of limited store buffering and ordering constraints required for proper synchronization. Today, programmers must choose from a spectrum of memory consistency models that reduce store stalls at the cost of increased programming complexity. Prior research suggests that the performance gap among consistency models can be closed through speculation---enforcing order only when dynamically necessary.
Unfortunately, past designs either provide insufficient buffering, replace all stores with read-modify-write operations, and/or recover from ordering violations via impractical fine-grained rollback mechanisms.\par We propose two mechanisms that, together, enable store-wait-free implementations of any memory consistency model. To eliminate buffer-capacity-related stalls, we propose the scalable store buffer, which places private/speculative values directly into the L1 cache, thereby eliminating the non-scalable associative search of conventional store buffers. To eliminate ordering-related stalls, we propose atomic sequence ordering, which enforces ordering constraints over coarse-grain access sequences while relaxing order among individual accesses. Using cycle-accurate full-system simulation of scientific and commercial applications, we demonstrate that these mechanisms allow the simplified programming of strict ordering while outperforming conventional implementations on average by 32\% (sequential consistency), 22\% (SPARC total store order) and 9\% (SPARC relaxed memory order).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "memory consistency models; store buffer design", } @Article{Ceze:2007:BBE, author = "Luis Ceze and James Tuck and Pablo Montesinos and Josep Torrellas", title = "{BulkSC}: bulk enforcement of sequential consistency", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "278--289", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250697", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "While Sequential Consistency (SC) is the most intuitive memory consistency model and the one most programmers likely assume, current multiprocessors do not support it. 
Instead, they support more relaxed models that deliver high performance. SC implementations are considered either too slow or -- when they can match the performance of relaxed models -- too difficult to implement.\par In this paper, we propose Bulk Enforcement of SC (BulkSC), a novel way of providing SC that is simple to implement and offers performance comparable to Release Consistency (RC). The idea is to dynamically group sets of consecutive instructions into chunks that appear to execute atomically and in isolation. The hardware enforces SC at the coarse grain of chunks which, to the program, appears as providing SC at the individual memory access level. BulkSC keeps the implementation simple by largely decoupling memory consistency enforcement from processor structures. Moreover, it delivers high performance by enabling full memory access reordering and overlapping within chunks and across chunks. We describe a complete system architecture that supports BulkSC and show that it delivers performance comparable to RC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bulk; chip multiprocessors; memory consistency models; programmability; sequential consistency", } @Article{Diniz:2007:LPC, author = "Bruno Diniz and Dorgival Guedes and Wagner {Meira, Jr.} and Ricardo Bianchini", title = "Limiting the power consumption of main memory", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "290--301", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250699", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The peak power consumption of hardware components affects their power supply, packaging, and cooling requirements. 
When the peak power consumption is high, the hardware components or the systems that use them can become expensive and bulky. Given that components and systems rarely (if ever) actually require peak power, it is highly desirable to limit power consumption to a less-than-peak power budget, based on which power supply, packaging, and cooling infrastructures can be more intelligently provisioned.\par In this paper, we study dynamic approaches for limiting the power consumption of main memories. Specifically, we propose four techniques that limit consumption by adjusting the power states of the memory devices, as a function of the load on the memory subsystem. Our simulations of applications from three benchmarks demonstrate that our techniques can consistently limit power to a pre-established budget. Two of the techniques can limit power with very low performance degradation. Our results also show that, when using these superior techniques, limiting power is at least as effective an energy-conservation approach as state-of-the-art techniques explicitly designed for performance-aware energy conservation.
These latter results represent a departure from current energy management research and practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "main memory; performance; power and energy management", } @Article{Mesa-Martinez:2007:PMV, author = "Francisco Javier Mesa-Martinez and Joseph Nayfach-Battilana and Jose Renau", title = "Power model validation through thermal measurements", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "302--311", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250700", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Simulation environments are an indispensable tool in the design, prototyping, performance evaluation, and analysis of computer systems. Simulator must be able to faithfully reflect the behavior of the system being analyzed. To ensure the accuracy of the simulator, it must be verified and determined to closely match empirical data. Modern processors provide enough performance counters to validate the majority of the performance models; nevertheless, the information provided is not enough to validate power and thermal models.\par In order to address some of the difficulties associated with the validation of power and thermal models, this paper proposes an infrared measurement setup to capture run-time power consumption and thermal characteristics of modern chips. We use infrared cameras with high spatial resolution ($ 10 \times 10 $ $ \mu $ m) and high frame rate (125fps) to capture thermal maps. To generate a detailed power breakdown (leakage and dynamic) for each processor floorplan unit, we employ genetic algorithms. 
The genetic algorithm finds a power equation for each floorplan block that produces the measured temperature for a given thermal package. The difference between the predicted power and the externally measured power consumption for an AMD Athlon analyzed in this paper has less than 1\% discrepancy. As an example of applicability, we compare the obtained measurements with CACTI power models, and propose extensions to existing thermal models to increase accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "power and thermal measurements", } @Article{Lin:2007:TMM, author = "Jiang Lin and Hongzhong Zheng and Zhichun Zhu and Howard David and Zhao Zhang", title = "Thermal modeling and management of {DRAM} memory systems", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "312--322", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250701", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With increasing speed and power density, high-performance memories, including FB-DIMM (Fully Buffered DIMM) and DDR2 DRAM, now begin to require dynamic thermal management (DTM) as processors and hard drives did. The DTM of memories, nevertheless, is different in that it should take the processor performance and power consumption into consideration. Existing schemes have ignored that. In this study, we investigate a new approach that controls the memory thermal issues from the source generating memory activities -- the processor. It will smooth the program execution when compared with shutting down memory abruptly, and therefore improve the overall system performance and power efficiency. For multicore systems, we propose two schemes called adaptive core gating and coordinated DVFS.
The first scheme activates clock gating on selected processor cores and the second one scales down the frequency and voltage levels of processor cores when the memory is to be over-heated. They can successfully control the memory activities and handle thermal emergency. More importantly, they improve performance significantly under the given thermal envelope. Our simulation results show that adaptive core gating improves performance by up to 23.3\% (16.3\% on average) on a four-core system with FB-DIMM when compared with DRAM thermal shutdown; and coordinated DVFS with control-theoretic methods improves the performance by up to 18.5\% (8.3\% on average).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "DRAM memories; thermal management; thermal modeling", } @Article{Tiwari:2007:RPA, author = "Abhishek Tiwari and Smruti R. Sarangi and Josep Torrellas", title = "{ReCycle}: pipeline adaptation to tolerate process variation", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "323--334", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250703", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Process variation affects processor pipelines by making some stages slower and others faster, therefore exacerbating pipeline unbalance. This reduces the frequency attainable by the pipeline. To improve performance, this paper proposes ReCycle, an architectural framework that comprehensively applies cycle time stealing to the pipeline -- transferring the time slack of the faster stages to the slow ones by skewing clock arrival times to latching elements after fabrication. As a result, the pipeline can be clocked with a period equal to the average stage delay rather than the longest one.
In addition, ReCycle's frequency gains are enhanced with Donor stages, which are empty stages added to 'donate' slack to the slow stages. Finally, ReCycle can also convert slack into power reductions.\par For a 17FO4 pipeline, ReCycle increases the frequency by 12\% and the application performance by 9\% on average. Combining ReCycle and donor stages delivers improvements of 36\% in frequency and 15\% in performance on average, completely reclaiming the performance losses due to variation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "clock skew; pipeline; process variation", } @Article{Sassone:2007:MSR, author = "Peter G. Sassone and Jeff {Rupley II} and Edward Brekelbaum and Gabriel H. Loh and Bryan Black", title = "Matrix scheduler reloaded", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "335--346", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250704", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "From multiprocessor scale-up to cache sizes to the number of reorder-buffer entries, microarchitects wish to reap the benefits of more computing resources while staying within power and latency bounds. This tension is quite evident in schedulers, which need to be large and single-cycle for maximum performance on out-of-order cores. In this work we present two straightforward modifications to a matrix scheduler implementation which greatly strengthen its scalability. Both are based on the simple observation that the wakeup and picker matrices are sparse, even at small sizes; thus small indirection tables can be used to greatly reduce their width and latency. 
This technique can be used to create quicker iso-performance schedulers (17-58\% reduced critical path) or larger iso-timing schedulers (7-26\% IPC increase). Importantly, the power and area requirements of the additional hardware are likely offset by the greatly reduced matrix sizes and subsuming the functionality of the power-hungry allocation CAMs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "matrix; microarchitecture; picker; scheduler; wakeup", } @Article{Sethumadhavan:2007:LBE, author = "Simha Sethumadhavan and Franziska Roesner and Joel S. Emer and Doug Burger and Stephen W. Keckler", title = "Late-binding: enabling unordered load-store queues", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "347--357", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250705", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Conventional load/store queues (LSQs) are an impediment to both power-efficient execution in superscalar processors and scaling to large-window designs. In this paper, we propose techniques to improve the area and power efficiency of LSQs by allocating entries when instructions issue ('late binding'), rather than when they are dispatched. This approach enables lower occupancy and thus smaller LSQs. Efficient implementations of late-binding LSQs, however, require the entries in the LSQ to be unordered with respect to age. In this paper, we show how to provide full LSQ functionality in an unordered design with only small additional complexity and negligible performance losses. 
We show that late-binding, unordered LSQs work well for small-window superscalar processors, but can also be scaled effectively to large, kilo-window processors by breaking the LSQs into address-interleaved banks. To handle the increased overflows, we apply classic network flow control techniques to the processor micronetworks, enabling low-overhead recovery mechanisms from bank overflows. We evaluate three such mechanisms: instruction replay, skid buffers, and virtual-channel buffering in the on-chip memory network. We show that for an 80-instruction window, the LSQ can be reduced to 32 entries. For a 1024-instruction window, the unordered, late-binding LSQ works well with four banks of 48 entries each. By applying a Bloom filter as well, this design achieves full hardware memory disambiguation for a 1,024 instruction window while requiring low average power per load and store access of 8 and 12 CAM entries, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "late binding; memory disambiguation; network flow control", } @Article{Leverich:2007:CMS, author = "Jacob Leverich and Hideho Arakida and Alex Solomatnikov and Amin Firoozshahian and Mark Horowitz and Christos Kozyrakis", title = "Comparing memory systems for chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "358--368", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250707", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "There are two basic models for the on-chip memory in CMP systems: hardware-managed coherent caches and software-managed streaming memory. 
This paper performs a direct comparison of the two models under the same set of assumptions about technology, area, and computational capabilities. The goal is to quantify how and when they differ in terms of performance, energy consumption, bandwidth requirements, and latency tolerance for general-purpose CMPs. We demonstrate that for data-parallel applications, the cache-based and streaming models perform and scale equally well. For certain applications with little data reuse, streaming scales better due to better bandwidth use and macroscopic software prefetching. However, the introduction of techniques such as hardware prefetching and non-allocating stores to the cache-based model eliminates the streaming advantage. Overall, our results indicate that there is not sufficient advantage in building streaming memory systems where all on-chip memory structures are explicitly managed. On the other hand, we show that streaming at the programming model level is particularly beneficial, even with the cache-based model, as it enhances locality and creates opportunities for bandwidth optimizations. 
Moreover, we observe that stream programming is actually easier with the cache-based model because the hardware guarantees correct, best-effort execution even when the programmer cannot fully regularize an application's code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; coherent caches; locality optimizations; parallel programming; streaming memory", } @Article{Muralimanohar:2007:IDC, author = "Naveen Muralimanohar and Rajeev Balasubramonian", title = "Interconnect design considerations for large {NUCA} caches", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "369--380", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250708", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The ever increasing sizes of on-chip caches and the growing domination of wire delay necessitate significant changes to cache hierarchy design methodologies. Many recent proposals advocate splitting the cache into a large number of banks and employing a network-on-chip (NoC) to allow fast access to nearby banks (referred to as Non-Uniform Cache Architectures--NUCA). Most studies on NUCA organizations have assumed a generic NoC and focused on logical policies for cache block placement, movement, and search. Since wire/router delay and power are major limiting factors in modern processors, this work focuses on interconnect design and its influence on NUCA performance and power. We extend the widely-used CACTI cache modeling tool to take network design parameters into account. With these overheads appropriately accounted for, the optimal cache organization is typically very different from that assumed in prior NUCA studies. 
To alleviate the interconnect delay bottleneck, we propose novel cache access optimizations that introduce heterogeneity within the inter-bank network. The careful consideration of interconnect choices for a large cache results in a 51\% performance improvement over a baseline generic NoC and the introduction of heterogeneity within the network yields an additional 11-15\% performance improvement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache models; interconnect; memory hierarchies; network-on-chip; non-uniform cache architecture", } @Article{Qureshi:2007:AIP, author = "Moinuddin K. Qureshi and Aamer Jaleel and Yale N. Patt and Simon C. Steely and Joel Emer", title = "Adaptive insertion policies for high performance caching", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "381--391", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250709", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The commonly used LRU replacement policy is susceptible to thrashing for memory-intensive workloads that have a working set greater than the available cache size. For such applications, the majority of lines traverse from the MRU position to the LRU position without receiving any cache hits, resulting in inefficient use of cache space. Cache performance can be improved if some fraction of the working set is retained in the cache so that at least that fraction of the working set can contribute to cache hits.\par We show that simple changes to the insertion policy can significantly reduce cache misses for memory-intensive workloads. We propose the LRU Insertion Policy (LIP) which places the incoming line in the LRU position instead of the MRU position. 
LIP protects the cache from thrashing and results in close to optimal hit rate for applications that have a cyclic reference pattern. We also propose the Bimodal Insertion Policy (BIP) as an enhancement of LIP that adapts to changes in the working set while maintaining the thrashing protection of LIP. We finally propose a Dynamic Insertion Policy (DIP) to choose between BIP and the traditional LRU policy depending on which policy incurs fewer misses. The proposed insertion policies do not require any change to the existing cache structure, are trivial to implement, and have a storage requirement of less than two bytes. We show that DIP reduces the average MPKI of the baseline 1MB 16-way L2 cache by 21\%, bridging two-thirds of the gap between LRU and OPT.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "replacement; set dueling; set sampling; thrashing", } @Article{Karger:2007:PSL, author = "Paul A. Karger", title = "Performance and security lessons learned from virtualizing the {Alpha} processor", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "392--401", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250711", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Virtualization has become much more important throughout the computer industry both to improve security and to support multiple workloads on the same hardware with effective isolation between those workloads. The most widely used chip architecture, the Intel and AMD x86 processors, have begun to support virtualization, but the initial implementations show some limitations. This paper examines the virtualization properties of the Alpha architecture with particular emphasis on features that improve performance and security. 
It shows how the Alpha's features of PALcode, address space numbers, software handling of translation buffer misses, lack of used and modified bits, and secure handling of unpredictable results all contribute to making virtualization of the Alpha particularly easy. The paper then compares the virtual architecture of the Alpha with Intel's and AMD's virtualization approaches for x86. It also comments briefly on Intel's virtualization technology for Itanium, IBM's zSeries and pSeries hypervisors and Sun's UltraSPARC virtualization. It particularly identifies some differences between translation buffers on x86 and translation buffers on VAX and Alpha that can have adverse performance consequences.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "hypervisors; security; virtual machine monitors; virtualizability", } @Article{Karkhanis:2007:ADA, author = "Tejas S. Karkhanis and James E. Smith", title = "Automated design of application specific superscalar processors: an analytical approach", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "402--411", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250712", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Analytical modeling is applied to the automated design of application-specific superscalar processors. Using an analytical method bridges the gap between the size of the design space and the time required for detailed cycle-accurate simulations. The proposed design framework takes as inputs the design targets (upper bounds on execution time, area, and energy), design alternatives, and one or more application programs. The output is the set of out-of-order superscalar processors that are Pareto-optimal with respect to performance-energy-area. 
The core of the new design framework is made up of analytical performance and energy activity models, and an analytical model-based design optimization process.\par For a set of benchmark programs and a design space of 2000 designs, the design framework arrives at all performance-energy-area Pareto-optimal design points within 16 minutes on a 2 GHz Pentium-4. In contrast, it is estimated that a na{\"\i}ve cycle-accurate simulation-based exhaustive search would require at least two months to arrive at the Pareto-optimal design points for the same design space.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "analytical model; application specific processors; design optimization; energy model; performance model", } @Article{Phansalkar:2007:ARA, author = "Aashish Phansalkar and Ajay Joshi and Lizy K. John", title = "Analysis of redundancy and application balance in the {SPEC CPU2006} benchmark suite", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "412--423", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250713", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The recently released SPEC CPU2006 benchmark suite is expected to be used by computer designers and computer architecture researchers for pre-silicon early design analysis. Partial use of benchmark suites by researchers, due to simulation time constraints, compiler difficulties, or library or system call issues is likely to happen; but a random subset can lead to misleading results. 
This paper analyzes the SPEC CPU2006 benchmarks using performance counter based experimentation from several state of the art systems, and uses statistical techniques such as principal component analysis and clustering to draw inferences on the similarity of the benchmarks and the redundancy in the suite and arrive at meaningful subsets.\par The SPEC CPU2006 benchmark suite contains several programs from areas such as artificial intelligence and includes none from the electronic design automation (EDA) application area. Hence there is a concern on the application balance in the suite. An analysis from the perspective of fundamental program characteristics shows that the included programs offer characteristics broader than the EDA programs' space. A subset of 6 integer programs and 8 floating point programs can yield most of the information from the entire suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "benchmark; clustering; microprocessor performance counters; SPEC", } @Article{Kim:2007:VPR, author = "Hyesoon Kim and Jos{\'e} A. Joao and Onur Mutlu and Chang Joo Lee and Yale N. Patt and Robert Cohn", title = "{VPC} prediction: reducing the cost of indirect branches via hardware-based dynamic devirtualization", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "424--435", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250715", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Indirect branches have become increasingly common in modular programs written in modern object-oriented languages and virtual machine based runtime systems. Unfortunately, the prediction accuracy of indirect branches has not improved as much as that of conditional branches. 
Furthermore, previously proposed indirect branch predictors usually require a significant amount of extra hardware storage and complexity, which makes them less attractive to implement.\par This paper proposes a new technique for handling indirect branches, called Virtual Program Counter (VPC) prediction. The key idea of VPC prediction is to treat a single indirect branch as multiple virtual conditional branches in hardware for prediction purposes. Our technique predicts each of the virtual conditional branches using the existing conditional branch prediction hardware. Thus, no separate storage structure is required for predicting indirect branch targets.\par Our evaluation shows that VPC prediction improves average performance by 26.7\% compared to a commonly-used branch target buffer based predictor on 12 indirect branch intensive applications. VPC prediction achieves the performance improvement provided by at least a 12KB (and usually a 192KB) tagged target cache predictor on half of the examined applications. We show that VPC prediction can be used with any existing conditional branch prediction mechanism and that the accuracy of VPC prediction improves when a more accurate conditional branch predictor is used.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "devirtualization; indirect branch prediction; virtual functions", } @Article{Hilton:2007:GCI, author = "Andrew D. 
Hilton and Amir Roth", title = "{Ginger}: control independence using tag rewriting", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "436--447", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250716", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The negative performance impact of branch mis-predictions can be reduced by exploiting control independence (CI). When a branch mis-predicts, the wrong-path instructions up to the point where control converges with the correct path are selectively squashed and replaced with correct-path instructions. Instructions beyond the convergence-point--the branch's control-independent (CI) instructions--are spared from squashing. Exploiting CI requires updating the input data dependences of CI instructions to reflect the selective removal and insertion of logically older instructions and transitively re-dispatching those CI instructions whose inputs have changed. This capability is generally called out-of-order renaming. Previously proposed CI designs use out-of-order renaming schemes that either consume excessive rename/dispatch bandwidth, can only be applied in limited cases, or incur a cost even when the branch would be correctly predicted.\par Ginger is a CI design that is both general and bandwidth efficient. Ginger implements out-of-order renaming using tag rewriting, re-linking the input dependences of CI instructions as they sit in the window. To do this, Ginger halts the pipeline and uses the idle map table read and write ports and the issue queue match lines and write lines to perform a register-tag 'search-and-replace' operation. After a few cycles, the pipeline restarts and execution resumes with correct data dependences.
Cycle-level simulation shows that Ginger out-performs previous CI designs, yielding geometric mean speedups over an aggressive non-CI processor of 5\%, 12\%, and 11\%--on SPECint2000, MediaBench, and CommBench--with speedups of 15\% or greater on 11 of 46 programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "branch misprediction; control independence; out-of-order renaming; selective re-dispatch", } @Article{Al-Zawawi:2007:TCI, author = "Ahmed S. Al-Zawawi and Vimal K. Reddy and Eric Rotenberg and Haitham H. Akkary", title = "Transparent control independence {(TCI)}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "448--459", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250717", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Superscalar architectures have been proposed that exploit control independence, reducing the performance penalty of branch mispredictions by preserving the work of future misprediction-independent instructions. The essential goal of exploiting control independence is to completely decouple future misprediction-independent instructions from deferred misprediction-dependent instructions. Current implementations fall short of this goal because they explicitly maintain program order among misprediction-independent and misprediction-dependent instructions. Explicit approaches sacrifice design efficiency and ultimately performance.\par We observe it is sufficient to emulate program order. Potential misprediction-dependent instructions are singled out a priori and their unchanging source values are checkpointed. These instructions and values are set aside as a 'recovery program'.
Checkpointed source values break the data dependencies with co-mingled misprediction-independent instructions---now long since gone from the pipeline---achieving the essential decoupling objective. When the mispredicted branch resolves, recovery is achieved by fetching the self-sufficient, condensed recovery program. Recovery is effectively transparent to the pipeline, in that speculative state is not rolled back and recovery appears as a jump to code. A coarse-grain retirement substrate permits the relaxed order between the decoupled programs. Transparent control independence (TCI) yields a highly streamlined pipeline that quickly recycles resources based on conventional speculation, enabling a large window with small cycle-critical resources, and prevents many mispredictions from disrupting this large window.\par TCI achieves speedups as high as 64\% (16\% average) and 88\% (22\% average) for 4-issue and 8-issue pipelines, respectively, among 15 SPEC integer benchmarks. Factors that limit the performance of explicitly ordered approaches are quantified.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "branch prediction; checkpoints; control independence; selective re-execution; selective recovery; speculation", } @Article{Wang:2007:EAA, author = "Nicholas J. Wang and Aqeel Mahesri and Sanjay J. Patel", title = "Examining {ACE} analysis reliability estimates using fault-injection", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "460--469", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250719", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "ACE analysis is a technique to provide an early reliability estimate for microprocessors.
ACE analysis couples data from abstract performance models with low level design details to identify and rule out transient faults that will not cause incorrect execution. While many transient faults are analyzable in ACE analysis frameworks, some are not. As a result, ACE analysis is conservative and provides a lower bound for the reliability of a processor design. Bounding the reliability of a design is useful since it can guarantee that the given design will meet reliability goals.\par In this work, we quantify and identify the sources of ACE analysis conservatism by comparing an ACE analysis methodology against a rigorous fault-injection study. We evaluate two flavors of ACE analysis: a 'simple' analysis and a refined analysis, finding that even the refined analysis overestimates the soft error vulnerability of an instruction scheduler by 2-3x. The conservatism stems from two key sources: from lack of detail in abstract performance models and from what we term Y-Bits, a result of the single-pass simulation methodology that is typical of ACE analysis. We also examine the efficacy of applying ACE analysis to a class of 'partial coverage' error mitigation techniques. In particular, we perform a case study on one such technique and extrapolate our findings to others.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "fault tolerance; measurement techniques; microprocessors; soft errors", } @Article{Aggarwal:2007:CIB, author = "Nidhi Aggarwal and Parthasarathy Ranganathan and Norman P. Jouppi and James E. 
Smith", title = "Configurable isolation: building high availability systems with commodity multi-core processors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "470--481", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250720", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "High availability is an increasingly important requirement for enterprise systems, often valued more than performance. Systems designed for high availability typically use redundant hardware for error detection and continued uptime in the event of a failure. Chip multiprocessors with an abundance of identical resources like cores, cache and interconnection networks would appear to be ideal building blocks for implementing high availability solutions on chip. However, doing so poses significant challenges with respect to error containment and faulty component replacement. Increasing silicon and transient fault rates with future technology scaling exacerbate the problem. This paper proposes a novel, cost-effective, architecture for high availability systems built from future multi-core processors. We propose a new chip multiprocessor architecture that provides configurable isolation for fault containment and component retirement, based upon cost-effective modifications to commodity designs. 
The design is evaluated for a state-of-the-art industrial fault model and the proposed architecture is shown to provide effective fault isolation and graceful degradation even when the failure rate is high.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; fault isolation; high availability", } @Article{Dalton:2007:RFI, author = "Michael Dalton and Hari Kannan and Christos Kozyrakis", title = "{Raksha}: a flexible information flow architecture for software security", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "482--493", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250722", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "High-level semantic vulnerabilities such as SQL injection and cross-site scripting have surpassed buffer overflows as the most prevalent security exploits. The breadth and diversity of software vulnerabilities demand new security solutions that combine the speed and practicality of hardware approaches with the flexibility and robustness of software systems.\par This paper proposes Raksha, an architecture for software security based on dynamic information flow tracking (DIFT). Raksha provides three novel features that allow for a flexible hardware/software approach to security. First, it supports flexible and programmable security policies that enable software to direct hardware analysis towards a wide range of high-level and low-level attacks. Second, it supports multiple active security policies that can protect the system against concurrent attacks. 
Third, it supports low-overhead security handlers that allow software to correct, complement, or extend the hardware-based analysis without the overhead associated with operating system traps.\par We present an FPGA prototype for Raksha that provides a full featured Linux workstation for security analysis. Using unmodified binaries for real-world applications, we demonstrate that Raksha can detect high-level attacks such as directory traversal, command injection, SQL injection, and cross-site scripting as well as low-level attacks such as buffer overflows. We also show that low overhead exception handling is critical for analyses such as memory corruption protection in order to address false positives that occur due to the diverse code patterns in frequently used software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dynamic; semantic vulnerabilities; software security", } @Article{Wang:2007:NCD, author = "Zhenghong Wang and Ruby B. Lee", title = "New cache designs for thwarting software cache-based side channel attacks", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "494--505", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250723", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software cache-based side channel attacks are a serious new class of threats for computers. Unlike physical side channel attacks that mostly target embedded cryptographic devices, cache-based side channel attacks can also undermine general purpose systems. The attacks are easy to perform, effective on most platforms, and do not require special instruments or excessive computation power. 
In recently demonstrated attacks on software implementations of ciphers like AES and RSA, the full key can be recovered by an unprivileged user program performing simple timing measurements based on cache misses.\par We first analyze these attacks, identifying cache interference as the root cause of these attacks. We identify two basic mitigation approaches: the partition-based approach eliminates cache interference whereas the randomization-based approach randomizes cache interference so that zero information can be inferred. We present new security-aware cache designs, the Partition-Locked cache (PLcache) and Random Permutation cache (RPcache), analyze and prove their security, and evaluate their performance. Our results show that our new cache designs with built-in security can defend against cache-based side channel attacks in general---rather than only specific attacks on a given cryptographic algorithm---with very little performance degradation and hardware cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache; computer architecture; processor; security; side channel; timing attacks", } @Article{Soundararajan:2007:MBV, author = "Niranjan Kumar Soundararajan and Angshuman Parashar and Anand Sivasubramaniam", title = "Mechanisms for bounding vulnerabilities of processor structures", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "506--515", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250725", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Concern for the increasing susceptibility of processor structures to transient errors has led to several recent research efforts that propose architectural techniques to enhance reliability.
However, real systems are typically required to satisfy hard reliability budgets, and barring expensive full-redundancy approaches, none of the proposed solutions treat any reliability budgets or bounds as hard constraints. Meeting vulnerability bounds requires monitoring vulnerabilities of processor structures and taking appropriate actions whenever these bounds are violated. This mandates treating reliability as a first-order microarchitecture design constraint, while optimizing performance as long as reliability requirements are satisfied. This paper makes three key contributions towards this goal: (i) we present a simple infrastructure to monitor and provide upper bounds on the vulnerabilities of key processor structures at cycle-level fidelity; (ii) we propose two distinct control mechanisms---throttling and selective redundancy---to proactively and/or reactively bound the vulnerabilities to any limit specified by the system designer; (iii) within this framework, we propose a novel adaptation of Out-of-Order Commit for vulnerability reduction, which automatically provides additional leverage for the control mechanisms to boost performance while remaining within the reliability budget.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "microarchitecture; redundant threading; transient faults", } @Article{Walcott:2007:DPA, author = "Kristen R.
Walcott and Greg Humphreys and Sudhanva Gurumurthi", title = "Dynamic prediction of architectural vulnerability from microarchitectural state", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "516--527", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250726", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Transient faults due to particle strikes are a key challenge in microprocessor design. Driven by exponentially increasing transistor counts, per-chip faults are a growing burden. To protect against soft errors, redundancy techniques such as redundant multithreading (RMT) are often used. However, these techniques assume that the probability that a structural fault will result in a soft error (i.e., the Architectural Vulnerability Factor (AVF)) is 100 percent, unnecessarily draining processor resources. Due to the high cost of redundancy, there have been efforts to throttle RMT at runtime. To date, these methods have not incorporated an AVF model and therefore tend to be ad hoc. Unfortunately, computing the AVF of complex microprocessor structures (e.g., the ISQ) can be quite involved.\par To provide probabilistic guarantees about fault tolerance, we have created a rigorous characterization of AVF behavior that can be easily implemented in hardware. We experimentally demonstrate AVF variability within and across the SPEC2000 benchmarks and identify strong correlations between structural AVF values and a small set of processor metrics. 
Using these simple indicators as predictors, we create a proof-of-concept RMT implementation that demonstrates that AVF prediction can be used to maintain a low fault tolerance level without significant performance impact.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "architecture vulnerability factor; microarchitecture; performance; redundant multithreading; reliability", } @Article{Aggarwal:2007:ISI, author = "Aneesh Aggarwal and Pradip Bose and Mohamed Zahran", title = "Introduction to the special issue on the {2006 Reconfigurable and Adaptive Architecture Workshop}", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "1--1", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294317", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The papers that follow comprise the proceedings of the first Reconfigurable and Adaptive Architecture Workshop (RAAW 2006) that was held in conjunction with the 39$^{th}$ International Conference on Microarchitecture in Orlando, Florida.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bellas:2007:MSA, author = "Nikolaos Bellas and Sek M. 
Chai and Malcolm Dwyer and Dan Linzmeier", title = "Mapping streaming architectures on reconfigurable platforms", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "2--8", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294318", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware accelerators, used as application-specific extensions to the computational capabilities of a system, are efficient mechanisms to enhance the performance and reduce the power dissipation in a System On Chip (SoC). These accelerators execute on the computationally critical part of the application, and offload computations from the scalar processors. In this paper, we present a design automation tool that generates accelerators based on a given application kernel. The accelerators are processing streaming data, and support a programming model which can naturally express a large number of embedded applications, and which results in efficient and fast hardware implementations. We demonstrate the applicability of the tool for architectural space exploration for a number of media applications, with results on area, throughput, and clock speeds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Labrecque:2007:CCG, author = "Martin Labrecque and Peter Yiannacouras and J. 
Gregory Steffan", title = "Custom code generation for soft processors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "9--19", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294319", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Embedded systems designers that use FPGAs are increasingly including soft processors in their designs (configurable processors built in the programmable logic of the FPGA). While there has been a significant amount of research on adding custom instructions and accelerators to soft processors, these are typically used to extend an unmodified base ISA targeted by generic compilation such as with unmodified gcc. In this paper we explore several opportunities for the compiler to optimize the code generated for soft processors through application-specific customization of the base ISA---techniques that are orthogonal to adding custom instructions. In particular we explore: (i) low level software-hardware trade-offs between basic instructions; (ii) the utility of ISA-specific features---in particular for the delay slots and Hi/Lo registers in the MIPS ISA; and (iii) application specific register management. 
We find that through these techniques that have no hardware cost we can improve the area efficiency of soft processors by 12\% on average across a suite of benchmarks, and by up to 47\% in the best case.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Suri:2007:IIL, author = "Tameesh Suri", title = "Improving instruction level parallelism through reconfigurable units in superscalar processors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "20--27", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294320", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With reducing feature sizes, more transistors can be integrated on the chip. The increased transistor budget can be utilized to improve the instruction level parallelism (ILP) exploited from the processor. However, the transistors cannot be used to arbitrarily increase the processor width and size in the hope of exploiting better ILP. In this paper, we propose an architecture where the superscalar datapath is tightly coupled with a reconfigurable unit (RFU). The reconfiguration unit is configured to execute the traces of dynamic instructions that are frequently executed. To address the data dependency issues between the instructions in the superscalar and the RFU, we propose to execute the trace on the RFU with predicted values. When the trace instructions reach the issue queue in the superscalar, the predictions are validated. In this technique, performance improvement is obtained for correct prediction, whereas no performance degradation is incurred for mispredictions. 
With this architecture, we observe an average instructions per cycle (IPC) improvement of about 11\% over the simulated SPEC 2000 benchmarks, using a very small last value data value predictor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Najaf-abadi:2007:ACE, author = "Hashem H. Najaf-abadi and Eric Rotenberg", title = "Architectural {\em contesting\/}: exposing and exploiting temperamental behavior", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "28--35", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294321", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Previous studies have proposed techniques to dynamically change the architecture of a processor to better suit the characteristics of the workload at hand. However, all such approaches are prone to a fundamental trade-off between the architectural diversity they can provide and the latency of architectural change, their fixed-configuration performance and the complexity of finding the best architectural configuration for the workload at hand. In this study we argue that the full potential of dynamic architectural customization can only be achieved by diminishing the effect of the degree of available architectural diversity on the aforementioned performance factors.\par The performance of a statically designed processing core in a heterogeneous multi-core system is independent of the architectural diversity available. 
In addition, it is apparent that concurrent execution of code on differently architected cores automatically reveals which architecture is more suitable for the characteristics of a particular workload.\par We therefore propose architectural contesting; the redundant execution of code on a number of differently architected processors (each customized for a different set of workload characteristics) in a leader follower arrangement, such that the leader and follower cores continuously shift roles as one core or the other becomes more favorable for new code phases. In this manner effective execution is naturally transferred from one static architecture to the other with little latency.\par In this study, we show that the contesting of only processor width can yield an average speedup of 7.5\% and up to 12.5\% in integer SPEC benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tseng:2007:DHS, author = "Kuo-Kun Tseng and Ying-Dar Lin and Tsern-Huei Lee and Yuan-Cheng Lai", title = "Deterministic high-speed root-hashing automaton matching coprocessor for embedded network processor", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "36--43", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294314", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "While string matching plays an important role in deep packet inspection applications, its software algorithms are insufficient to meet the demands of high-speed performance. Accordingly, we were motivated to propose fast and deterministic performance root-hashing automaton matching (RHAM) coprocessor for embedded network processor. 
Although automaton algorithms are robust with deterministic matching time, there is still plenty of room for improvement of their average-case performance. The proposed RHAM employs novel root-hashing technique to accelerate automaton matching. In our experiment, RHAM is implemented in a prevalent automaton algorithm, Aho--Corasick (AC) which is often used in many packet inspection applications. Compared to the original AC, RHAM only requires extra vector size in 48 Kbytes for root-hashing, and has about 900\% and 420\% outperformance for 20,000 URLs and 10,000 virus patterns respectively. Implementation of RHAM FPGA can perform at the rate of 12.6 Gbps with the pattern amount in 34,215 bytes. This is superior to all previous matching hardware in terms of throughput and pattern set.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "coprocessor; finite automaton; hashing; packet inspection; string matching", } @Article{Sibai:2007:PAW, author = "Fadi N. Sibai", title = "Performance analysis and workload characterization of the {$3$DMark05} benchmark on modern parallel computer platforms", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "44--52", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294315", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With ever increasing CPU and graphics card speeds, and improved sophistication, stunning visual effects, and growing scene detail and real life-like content of 3D games, 3DMark{\reg} emerged as the leading PC benchmark for 3D gaming performance with several millions of worldwide downloads. Its tests are at the cutting edge of consumer graphics and push the limit of 3D rendering with spectacular scenes, and state of the art lighting techniques. 
The benchmark scores help quickly differentiate the platforms with state of the art graphic cards and processors from those with older components. In this paper, we analyze the scaling of the 3DMark{\reg}05 benchmark with CPU frequency, number of CPUs, number of GPUs, and number of threads supported by the hardware. We also characterize the benchmark's workload. These results reveal that the benchmark scales well indicating that 3D games if implemented with multiple Physics and Artificial Intelligence or other relevant content threads should show good scaling too on multi-CPU and multi-GPU platforms. The characterization results reveal the close dependence of 3D graphics applications on the memory subsystem's performance as 1 out of 2 instructions is a load or store instruction. The results also revealed that there is a direct correlation with the Game Tests' performance and the number of cache memory read misses per instruction retired, the number of stores retired per instruction retired, the number of polygons per Draw*Primitive; and the number of set-vertex shader calls per frame. All these events relate to the memory subsystem performance generally linking the 3D graphics applications' performance and the 3DMark{\reg} overall score to the platform's memory performance. 
Salient microarchitectural performance events of the CPU tests were also memory-related.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "3D graphics performance; multiple CPU and GPU core platforms; workload characterization", } @Article{Thorson:2007:INb, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "35", number = "3", pages = "53--55", month = jun, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1294313.1294323", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network.\par As always, the opinions expressed in this column are the personal views of the authors, and do not necessarily represent the institutions to which they are affiliated.\par Text which sets the context of a message appears underlined or in italics; this is usually text the author has quoted from earlier messages. The code-like expressions below the authors' names are their addresses on Internet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bartolini:2007:MPD, author = "S. Bartolini and P. Foglia and C. A. 
Prete", title = "{MEmory} performance: {DEaling} with applications, systems and architecture", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "4--5", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327314", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this issue, we present the papers from MEDEA-2006 Workshop [3] held in conjunction with the IEEE-ACM International Conference on Parallel Architectures and Compilation Techniques (PACT-2006) [1,2].", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Medea 2006 workshop.", } @Article{Lorton:2007:ABL, author = "K. Patrick Lorton and David S. Wise", title = "Analyzing block locality in {Morton}-order and {Morton}-hybrid matrices", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "6--12", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327315", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As the architectures of computers change, introducing more caches onto multicore chips, even more locality becomes necessary. With the bandwidth between caches and RAM now even more valuable, additional locality from new matrix representations will be important to keep multiple processors busy. The default storage representations of both C and Fortran, row- and column-major respectively, have fundamental deficiencies with many matrix computations. By switching the storage representation from Cartesian to block indices, one is able to take better advantage of cache locality at all levels from L1 to paging. 
This paper only changes storage representation from row-major to Morton-hybrid, and applies it to matrix multiplication. Its purpose is to show that, even with only traditional iterative algorithms, simply changing storage representation offers significant speedups.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "Cholesky factorization; Morton order; quadtrees", remark = "Medea 2006 workshop.", } @Article{Deris:2007:ICE, author = "Kaveh Jokar Deris and Amirali Baniasadi", title = "Investigating cache energy and latency break-even points in high performance processors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "13--20", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327316", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this work we study how cache complexity impacts energy and performance in high performance processors. Moreover, we estimate cache energy budget for two high performance processors. We calculate energy and latency break-even points for realistic and ideal cache organizations for different applications. We show that design efforts made to reduce cache miss rate are only justifiable from the energy and performance point of view only if the associated latency and energy overhead remain below the calculated break-even points.\par Furthermore, we show that, for the processors and applications studied here, the instruction cache has a lower latency break-even point compared to the data cache. 
However, investing energy in the data cache is likely to result in better energy efficiency compared to the instruction cache.\par We also study alternative cache configurations for different processors and investigate if such alternatives would improve energy efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Medea 2006 workshop.", } @Article{Yan:2007:EIC, author = "Jun Yan and Wei Zhang", title = "Evaluating instruction cache vulnerability to transient errors", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "21--28", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327317", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent research shows that microprocessors are increasingly susceptible to transient errors. In order to protect microprocessors cost-effectively, the first step is to accurately understand the impact of transient errors on the system reliability. While many research efforts have been focused on studying the vulnerability of data caches and other on-chip hardware components, instruction caches have received less attention. However, instructions are read every cycle, any undetected or uncorrected soft errors in instructions can lead to erroneous computation, wrong control flow or system crash.\par This paper studies the instruction cache vulnerability by considering both the raw SRAM rate and the cache vulnerability factor. Based on the concept of cache vulnerability factor, we also investigate the impact of different cache configuration parameters on the reliability of instruction caches. We find that on average 67.5\% of instruction cache soft errors can be masked by the I-cache itself without impacting other system components. 
While quantifying the instruction cache vulnerability itself does not solve the reliability problem of instruction cache against transient errors, we believe this work can provide useful insights for designers to develop cost-effective solutions to protect I-caches and to optimally balance the reliability of instruction caches with other system goals, such as cost, performance and energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Medea 2006 workshop.", } @Article{Ramirez:2007:EST, author = "Tanaus{\'u} Ram{\'\i}rez and Alex Pajuelo and Oliverio J. Santana and Mateo Valero", title = "Energy saving through a simple load control mechanism", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "29--36", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327318", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To alleviate the memory wall problem, current architectural trends suggest implementing large instruction windows able to maintain a high number of in-flight instructions. However, the benefits achieved by these recent proposals may be limited because more instructions are executed down the wrong path of a mispredicted branch. The larger number of misspeculated instructions involves increasing the energy consumed compared to traditional designs with smaller instruction windows. Our analysis shows that, for some SPEC2000 integer benchmarks, up to 2.5X wrong-path load instructions are executed when the instruction window of a 4-way superscalar processor is increased from 256 to 1024 entries.\par This paper describes a simple speculative control technique to prevent wrong-path load instructions from being executed.
Our technique extends the functionality of the load-store queue to block those load instructions that depend on a hard-to-predict conditional branch until it is resolved. If the branch is actually mispredicted, unnecessary cache misses can be avoided, saving energy down the wrong path. Furthermore, instructions that depend on a blocked load are not issued because their source values are not available, which also saves dynamic energy. Our results show that the proposed mechanism reduces, on average, up to 26\% misspeculated load instructions and 18\% wrong-path instructions without any performance loss.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "branch prediction; confidence estimation; energy saving; kilo-instruction processors", remark = "Medea 2006 workshop.", } @Article{Ramos:2007:DPC, author = "Luis M. Ramos and Jos{\'e} Luis Briz and Pablo E. Ib{\'a}{\~n}ez and Victor Vi{\~n}als", title = "Data prefetching in a cache hierarchy with high bandwidth and capacity", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "37--44", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327319", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper we evaluate four hardware data prefetchers in the context of a high-performance three-level on chip cache hierarchy with high bandwidth and capacity. We consider two classic prefetchers (Sequential Tagged and Stride) and two correlating prefetchers: PC/DC, a recent method with a superior score and low-sized tables, and P-DFCM, a new method. Like PC/DC, P-DFCM focuses on local delta sequences, but it is based on the DFCM value predictor. We explore different prefetch degrees and distances. 
Running SPEC2000, Olden and IAbench applications, results show that this kind of cache hierarchy turns prefetching aggressiveness into success for the four prefetchers. Sequential Tagged is the best, and deserves further attention to cut its losses in some applications. PC/DC results are matched or even improved by P-DFCM, using far fewer accesses to tables while keeping sizes low.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "hardware data prefetching", remark = "Medea 2006 workshop.", } @Article{Dybdahl:2007:LBR, author = "Haakon Dybdahl and Per Stenstr{\"o}m and Lasse Natvig", title = "An {LRU}-based replacement algorithm augmented with frequency of access in shared chip-multiprocessor caches", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "45--52", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327320", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes a new replacement algorithm to protect cache lines with potential future reuse from being evicted. In contrast to the recency based approaches used in the past (LRU for example), our algorithm also uses the notion of frequency of access. Instead of evicting the least recently used block, our algorithm identifies among a set of LRU blocks the one that is also least-frequently-used (according to a heuristic) and chooses that as a victim. We have implemented this replacement algorithm in a detailed simulation model of a chip multiprocessor system driven by SPEC2000 benchmarks. We have found that the new scheme improves performance for memory intensive applications. Moreover, as compared to other attempts, our replacement algorithm provides robust improvements across all benchmarks.
We have also extended an earlier scheme proposed by Wong and Baer so it is switched off when performance is not improved. Our results show that this makes the scheme much more suitable for CMP configurations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Medea 2006 workshop.", } @Article{Bardine:2007:IPE, author = "A. Bardine and P. Foglia and G. Gabrielli and C. A. Prete and P. Stenstr{\"o}m", title = "Improving power efficiency of {D-NUCA} caches", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "53--58", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327321", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "D-NUCA caches are cache memories that, thanks to banked organization, broadcast search and promotion/demotion mechanism, are able to tolerate the increasing wire delay effects introduced by technology scaling. As a consequence, they will outperform conventional caches (UCA, Uniform Cache Architectures) in future generation cores.\par Due to the promotion/demotion mechanism, we have found that, in a D-NUCA cache, the distribution of hits on the ways varies across applications as well as across different execution phases within a single application. In this paper, we show how such a behavior can be utilized to improve D-NUCA power efficiency as well as to decrease its access latencies. In particular, we propose a new D-NUCA structure, called Way Adaptable D-NUCA cache, in which the number of active (i.e. powered-on) ways is dynamically adapted to the need of the running application. 
Our initial evaluation shows that a consistent reduction of both the average number of active ways (42\% in average) and the number of bank access requests (29\% in average) is achieved, without significantly affecting the IPC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "Medea 2006 workshop.", } @Article{Thorson:2007:INc, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "35", number = "4", pages = "59--62", month = sep, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1327312.1327323", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kise:2007:SIA, author = "Kenji Kise and Toshinori Sato and Hironori Nakajo", title = "Special issue: {ALPS'07 -- Advanced Low Power Systems}: Introduction", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "1--2", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360469", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this issue, we present the papers from the proceedings of the 2nd International Workshop on Advanced Low Power Systems (ALPS 2007) that was held in conjunction with the 21st International Conference on Supercomputing in Seattle.\par 'Thoughtfulness' is an important keyword in the both current and future technologies in all over the world:
Thoughtful to human being, thoughtful to our surroundings, thoughtful to the earth, and so on. For the thoughtfulness, Low-power is believed to be one of the most indispensable keyword. The ALPS workshop focuses on the current technological challenges in developing low-power and power-aware computing systems ranging from servers to embedded devices. The goal of the workshop is to bring all aspects of power-aware computing from industry and academia.\par This year, we have one invited talk entitled 'An Under 2W 100GOPS Video Recognition Processor Based on a Linear Array of 128 4-Way VLIW Processing Elements' by Shorin Kyo (NEC Corporation) and 6 papers selected based on the full paper review by the program committee members.\par The first set of papers discusses low-power designs. We have three papers: 'Optimal Pipeline Depth with Pipeline Stage Unification Adoption' by Jun Yao, Hajime Shimada, Shinobu Miwa, and Shinji Tomita, 'VCLEARIT: A VLSI CMOS Circuit Leakage Reduction Technique For Nanoscale Technologies' by Preetham Lakshmikanthan and Adrian Nu{\~n}ez, and 'Leakage Energy Reduction in Cache Memory by Data Compression' by Kiyofumi Tanaka and Takahiro Kawahara.\par The second set of papers: 'Preventing Timing Errors on Register Writes: Mechanisms of Detections and Recoveries' by Hidetsugu Irie, Ken Sugimoto, Masahiro Goshima, and Shuichi Sakai, 'Not Multi-, but Many-Core: Designing Integral Parallel Architectures for Embedded Computation' by Mihaela Mali{\c{t}}a, Gheorghe {\c{S}}tefan, and Dominique Thi{\'e}baut, and 'Fine-grain Compensation Method with Consideration of Trade-offs between Computation and Data Transfer for Power Consumption' by Takefumi Miyoshi and Nobuhiko Sugino, covers reliability, many-core and parallelization issues.\par All papers here are going to create the way to the new aspects of low-power systems.
We hope you will find the papers of this special issue of Computer Architecture News to be stimulating and that you will be inspired to contribute your efforts to the future low power systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yao:2007:OPD, author = "Jun Yao and Shinobu Miwa and Hajime Shimada and Shinji Tomita", title = "Optimal pipeline depth with pipeline stage unification adoption", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "3--9", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360470", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To find the optimal pipeline design point by considering both performance and power objectives has been one focus of interest in recent researches. However, we found that previous papers did not consider deepening or shrinking pipeline depth dynamically during the program execution. In this paper, with the adoption of the earlier proposed Pipeline Stage Unification (PSU) method, we studied the relationship between power/performance and pipeline depth in processors with a pipeline of multi-usable depths. 
Our evaluation results of SPECint2000 benchmarks shown in this paper illustrate that the PSU adoption can achieve good efficiency for platforms which concern both energy and performance, even after the utilization of complex clock gating.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "pipeline design point; pipeline stage unification; power/performance", } @Article{Lakshmikanthan:2007:VVC, author = "Preetham Lakshmikanthan and Adrian Nu{\~n}ez", title = "{VCLEARIT}: a {VLSI CMOS} circuit leakage reduction technique for nanoscale technologies", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "10--16", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360471", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Leakage power loss is a major concern in deep-submicron technologies as it drains the battery even when a circuit is completely idle. In this paper, we first present a novel leakage reduction technique and then compare and contrast it with other well established leakage reduction techniques. Our leakage reduction technique achieves cancellation of leakage effects in both the pull-up network (PUN) as well as the pull-down network (PDN) for CMOS circuits. It involves voltage balancing in the PUN and PDN paths using a combination of high-$V_T$ (high voltage threshold) and standard-$V_T$ sleep transistors.
Experiments conducted on a variety of multi-level combinational MCNC'91 benchmarks show significant savings in leakage power (up to 3 orders of magnitude), with lesser area and delay penalty using our leakage reduction technique when compared to other techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tanaka:2007:LER, author = "Kiyofumi Tanaka and Takahiro Kawahara", title = "Leakage energy reduction in cache memory by data compression", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "17--24", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360472", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cache memory is effective in bridging a growing speed gap between a processor and relatively slow external main memory. Almost all of today's commercial processors, not only high-performance microprocessors but embedded ones, have on-chip cache memories. However, energy consumption in the cache memory would approach or exceed 50\% of the total consumption by the processors, which leads to a serious problem in terms of allowable temperature and performance improvement. An important point to note is that, in the near future, static (leakage) energy will dominate the energy consumption in deep sub-micron processes. In this paper, we propose cache memory architecture that exploits gated-Vdd control per cache block and a dynamic data compression scheme in the secondary cache, and achieves efficient reduction of static energy consumed by the secondary cache memory.
In the simulation using SPEC95 integer benchmarks, our technique reduced about 45\% of leakage energy in the cache at maximum, and about 28\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache memory; data compression; gated-Vdd; leakage energy", } @Article{Irie:2007:PTE, author = "Hidetsugu Irie and Ken Sugimoto and Masahiro Goshima and Shuichi Sakai", title = "Preventing timing errors on register writes: mechanisms of detections and recoveries", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "25--31", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360473", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To deal with the increasing variations of the intra-chip transistors, one promising approach is to dynamically detect and recover the timing-errors with microarchitecture. This will induce dependability and efficiency into microprocessors because it allows VLSI to operate at the optimum frequency and voltage while ensuring accuracy.\par A few approaches for dynamically detecting timing-errors have been proposed, but none of them have focused on register writes. In this paper, we propose a technique for detecting and recovering from timing errors during register writes. We introduce a verifying technique that uses additional buffer (called the write assurance buffer (WAB)) which is provided with a sufficient timing margin.
The evaluation results reveal a performance degradation of 4.5\% using an 8-entry WAB; this value becomes negligible when a 16-entry WAB is used.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Malita:2007:MMC, author = "Mihaela Mali{\c{t}}a and Gheorghe {\c{S}}tefan and Dominique Thi{\'e}baut", title = "Not multi-, but many-core: designing integral parallel architectures for embedded computation", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "32--38", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360474", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent embedded systems have switched to fully programmable parallel architectures. To make sure all corner cases usually present in real applications are supported and efficiently implemented in this switch of implementation, new solutions must be found. We introduce the integral parallel architecture (IPA) as a solution supporting intensive data computation in System-on-a-chip (SoC) implementations, fitting in a small area, and requiring low power. An IPA supports naturally all three possible styles of parallelism: data, time, and speculative.\par As an illustrative example, we present the BA1024 chip, a fully programmable SoC designed by BrightScale, Inc. for HDTV codecs.
Its main performance figures include 60 GOPS/Watt and 2 GOPS/mm$^2$, representing an efficient IPA approach for embedded computation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "embedded systems; parallel architectures; programmable systems; video processing", } @Article{Miyoshi:2007:FGC, author = "Takefumi Miyoshi and Nobuhiko Sugino", title = "Fine-grain compensation method with consideration of trade-offs between computation and data transfer for power consumption", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "39--44", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360475", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Fine-grain parallelizing method with consideration of the number of data transfers for low power consumption is proposed. In the proposed method, power consumption by data transfers between processor elements in a multiprocessor is focused on, and the number of data transfers is reduced.\par In this paper, a measure based on the relationship between variables in a given program is defined to evaluate the number of data transfers, firstly. And then a proposed compensation method by use of the evaluation of power consumption based on the measure is explained. Finally, the result of applying proposed compensation method implemented on COINS framework to several example programs is shown.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Romanescu:2007:VSC, author = "Bogdan F. Romanescu and Michael E. Bauer and Sule Ozev and Daniel J. 
Sorin", title = "{VariaSim}: simulating circuits and systems in the presence of process variability", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "45--48", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360465", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we present VariaSim, the publicly available Static Statistical Timing Analysis (SSTA) Tool from Duke University. VariaSim enables researchers to analyze the impact of CMOS process variability on the behavior of circuits and systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Venkateswaran:2007:FGSa, author = "N. Venkateswaran and Deepak Srinivasan and Madhavan Manivannan and T. P. Ramnath Sai Sagar and Shyamsundar Gopalakrishnan and VinothKrishnan Elangovan and Karthik Chandrasekar and Prem Kumar Ramesh and Viswanath Venkatesan and Arvindakshan Babu and Sudharshan", title = "Future generation supercomputers {I}: a paradigm for node architecture", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "49--60", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360466", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As a result of the increasing requirements of present and future computation intensive applications, there have been many fundamentally divergent approaches such as the Blue-Gene, TRIPS, HERO, Cascade spurred in order to provide increased performance at node level in supercomputing clusters. 
The design of the node architecture should be such that 'Cost-Effective Supercomputing' is realized without compromising on the requirements of the ever-performance hungry grand challenge applications. However, to increase performance at the cluster level, scalability and likewise tackling the mapping complexity across the large cluster of nodes becomes critical. The potential of such a node architecture can be fully exploited only with an appropriate cluster architecture. In an attempt to address these issues for efficient and Cost-Effective Supercomputing, we propose a novel paradigm for designing High Performance Clusters, in two papers. In paper-II, we discuss the design of operating system and cluster architecture. In this paper, we present a node architecture model based on the Memory In Processor paradigm and discuss the related architectural aspects (ISA, compiler, network interconnection etc). We provide a design space based on the proposed model for which a simulator is developed, with the help of which the performance of such a node architecture is outlined.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Venkateswaran:2007:FGSb, author = "N. Venkateswaran and Deepak Srinivasan and Madhavan Manivannan and T. P. Ramnath Sai Sagar and Shyamsundar Gopalakrishnan and VinothKrishnan Elangovan and Arvind M. 
and Prem Kumar Ramesh and Karthik Ganesan and Viswanath Krishnamurthy and Sivaramakrishnan", title = "Future generation supercomputers {II}: a paradigm for cluster architecture", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "61--70", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360467", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In part-I, a novel multi-core node architecture was proposed which when employed in a cluster environment would be capable of tackling computational complexity associated with wide class of applications. Furthermore, it was discussed that by appropriately scaling the architectural specifications, Teraops computing power could be achieved at the node level. In order to harness the computational power of such a node, we have developed an efficient application execution model with a competent cluster architectural backbone. In this paper we present the novel cluster paradigm, dealing with operating system design, parallel programming model and cluster interconnection network. Our approach in developing the competent cluster design revolves around an execution model to aid the execution of multiple applications simultaneously on all partitions of the cluster, leading to cost sharing across applications. 
This would be a major initiative towards achieving Cost-Effective Supercomputing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2007:INd, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "35", number = "5", pages = "71--73", month = dec, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1360464.1360477", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network.\par As always, the opinions expressed in this column are the personal views of the authors, and do not necessarily represent the institutions to which they are affiliated.\par Text which sets the context of a message appears underlined or in italics; this is usually text the author has quoted from earlier messages. The code-like expressions below the authors' names are their addresses on Internet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Winfree:2008:TMP, author = "Erik Winfree", title = "Toward molecular programming with {DNA}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "1--1", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346282", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Biological organisms are beautiful examples of programming. 
The program and data are stored in biological molecules such as DNA, RNA, and proteins; the algorithms are carried out by molecular and biochemical processes; and the end result is the creation and function of an organism. If we understood how to program molecular systems, what could we create? Lifelike technologies whose basic operations are chemical reactions? The fields of chemistry, physics, biology, and computer science are converging as we begin to synthesize molecules, molecular machines, and molecular systems of ever increasing complexity, leading to subdisciplines such as DNA nanotechnology, DNA computing, and synthetic biology. Having demonstrated simple devices and systems -- self-assembled structures, molecular motors, chemical logic gates -- researchers are now turning to the question of how to create large-scale integrated systems. To do so, we must learn how to manage complexity: how to efficiently specify the structure and behavior of intricate molecular systems, how to compile such specifications down to the design of molecules to be synthesized in the lab, and how to ensure that such systems function robustly. These issues will be illustrated for chemical logic circuits based on cascades of DNA hybridization reactions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "DNA; molecular programming", } @Article{Chen:2008:OVB, author = "Xiaoxin Chen and Tal Garfinkel and E. Christopher Lewis and Pratap Subrahmanyam and Carl A. Waldspurger and Dan Boneh and Jeffrey Dwoskin and Dan R. K. 
Ports", title = "{Overshadow}: a virtualization-based approach to retrofitting protection in commodity operating systems", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "2--13", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346284", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Commodity operating systems entrusted with securing sensitive data are remarkably large and complex, and consequently, frequently prone to compromise. To address this limitation, we introduce a virtual-machine-based system called Overshadow that protects the privacy and integrity of application data, even in the event of a total OS compromise. Overshadow presents an application with a normal view of its resources, but the OS with an encrypted view. This allows the operating system to carry out the complex task of managing an application's resources, without allowing it to read or modify them. Thus, Overshadow offers a last line of defense for application data.\par Overshadow builds on multi-shadowing, a novel mechanism that presents different views of 'physical' memory, depending on the context performing the access. This primitive offers an additional dimension of protection beyond the hierarchical protection domains implemented by traditional operating systems and processor architectures.\par We present the design and implementation of Overshadow and show how its new protection semantics can be integrated with existing systems. Our design has been fully implemented and used to protect a wide range of unmodified legacy applications running on an unmodified Linux operating system. 
We evaluate the performance of our implementation, demonstrating that this approach is practical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cloaking; hypervisors; memory protection; multi-shadowing; operating systems; virtual machine monitors; VMM", } @Article{McCune:2008:HLC, author = "Jonathan M. McCune and Bryan Parno and Adrian Perrig and Michael K. Reiter and Arvind Seshadri", title = "How low can you go?: recommendations for hardware-supported minimal {TCB} code execution", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "14--25", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346285", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We explore the extent to which newly available CPU-based security technology can reduce the Trusted Computing Base (TCB) for security-sensitive applications. We find that although this new technology represents a step in the right direction, significant performance issues remain. We offer several suggestions that leverage existing processor technology, retain security, and improve performance. 
Implementing these recommendations will finally allow application developers to focus exclusively on the security of their own code, enabling it to execute in isolation from the numerous vulnerabilities in the underlying layers of legacy code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "late launch; secure execution; trusted computing", } @Article{Bhargava:2008:ATD, author = "Ravi Bhargava and Benjamin Serebrin and Francesco Spadini and Srilatha Manne", title = "Accelerating two-dimensional page walks for virtualized systems", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "26--35", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346286", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Nested paging is a hardware solution for alleviating the software memory management overhead imposed by system virtualization. Nested paging complements existing page walk hardware to form a two-dimensional (2D) page walk, which reduces the need for hypervisor intervention in guest page table management. However, the extra dimension also increases the maximum number of architecturally-required page table references.\par This paper presents an in-depth examination of the 2D page table walk overhead and options for decreasing it. These options include using the AMD Opteron processor's page walk cache to exploit the strong reuse of page entry references. For a mix of server and SPEC benchmarks, the presented results show a 15\%-38\% improvement in guest performance by extending the existing page walk cache to also store the nested dimension of the 2D page walk. 
Caching nested page table translations and skipping multiple page entry references produce an additional 3\%-7\% improvement.\par Much of the remaining 2D page walk overhead is due to low-locality nested page entry references, which result in additional memory hierarchy misses. By using large pages, the hypervisor can eliminate many of these long-latency accesses and further improve the guest performance by 3\%-22\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "AMD; hypervisor; memory management; nested paging; page walk caching; TLB; virtual machine monitor; virtualization", } @Article{Lee:2008:ETL, author = "Benjamin C. Lee and David Brooks", title = "Efficiency trends and limits from comprehensive microarchitectural adaptivity", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "36--47", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346288", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Increasing demand for power-efficient, high-performance computing requires tuning applications and/or the underlying hardware to improve the mapping between workload heterogeneity and computational resources. To assess the potential benefits of hardware tuning, we propose a framework that leverages synergistic interactions between recent advances in (a) sampling, (b) predictive modeling, and (c) optimization heuristics. This framework enables qualitatively new capabilities in analyzing the performance and power characteristics of adaptive microarchitectures. For the first time, we are able to simultaneously consider high temporal and comprehensive spatial adaptivity. 
In particular, we optimize efficiency for many, short adaptive intervals and identify the best configuration of 15 parameters, which define a space of 240B points.\par With frequent sub-application reconfiguration and a fully reconfigurable hardware substrate, adaptive microarchitectures achieve bips$^3$/w efficiency gains of up to 5.3x (median 2.4x) relative to their static counterparts already optimized for a given application. This 5.3x efficiency gain is derived from a 1.6x performance gain and 0.8x power reduction. Although several applications achieve a significant fraction of their potential efficiency with as few as three adaptive parameters, the three most significant parameters differ across applications. These differences motivate a hardware substrate capable of comprehensive adaptivity to meet these diverse application requirements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "adaptivity; efficiency; inference; microarchitecture; performance; power; reconfigurability; regression; simulation; statistics", } @Article{Raghavendra:2008:NPS, author = "Ramya Raghavendra and Parthasarathy Ranganathan and Vanish Talwar and Zhikui Wang and Xiaoyun Zhu", title = "No `power' struggles: coordinated multi-level power management for the data center", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "48--59", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346289", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Power delivery, electricity consumption, and heat management are becoming key challenges in data center environments.
Several past solutions have individually evaluated different techniques to address separate aspects of this problem, in hardware and software, and at local and global levels. Unfortunately, there has been no corresponding work on coordinating all these solutions. In the absence of such coordination, these solutions are likely to interfere with one another, in unpredictable (and potentially dangerous) ways. This paper seeks to address this problem. We make two key contributions. First, we propose and validate a power management solution that coordinates different individual approaches. Using simulations based on 180 server traces from nine different real-world enterprises, we demonstrate the correctness, stability, and efficiency advantages of our solution. Second, using our unified architecture as the base, we perform a detailed quantitative sensitivity analysis and draw conclusions about the impact of different architectures, implementations, workloads, and system design choices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "capping; control theory; coordination; data center; efficiency; power management; virtualization", } @Article{Ballapuram:2008:EAS, author = "Chinnakrishnan S. Ballapuram and Ahmad Sharif and Hsien-Hsin S. Lee", title = "Exploiting access semantics and program behavior to reduce snoop power in chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "60--69", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346290", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Integrating more processor cores on-die has become the unanimous trend in the microprocessor industry. 
Most of the current research thrusts using chip multiprocessors (CMPs) as the baseline to analyze problems in various domains. One of the main design issues facing CMP systems is the growing number of snoops required to maintain cache coherency and to support self/cross-modifying code that leads to power and performance limitations. In this paper, we analyze the internal and external snoop behavior in a CMP system and relax the snoopy cache coherence protocol based on the program semantics and properties of the shared variables for saving power. Based on the observations and analyses, we propose two novel techniques: Selective Snoop Probe (SSP) and Essential Snoop Probe (ESP) to reduce power without compromising performance. Our simulation results show that using the SSP technique, 5\% to 65\% data cache energy savings per core for different processor configurations can be achieved with 1\% to 2\% performance improvement. We also show that 5\% to 82\% of data cache energy per core is spent on the non-essential snoop probes that can be saved using the ESP technique.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; internal and external snoops; MESI protocol; self-modifying code", } @Article{Mallik:2008:PMU, author = "Arindam Mallik and Jack Cosgrove and Robert P. Dick and Gokhan Memik and Peter Dinda", title = "{PICSEL}: measuring user-perceived performance to control dynamic frequency scaling", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "70--79", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346291", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The ultimate goal of a computer system is to satisfy its users. 
The success of architectural or system-level optimizations depends largely on having accurate metrics for user satisfaction. We propose to derive such metrics from information that is `close to flesh' and apparent to the user rather than from information that is `close to metal' and hidden from the user. We describe and evaluate PICSEL, a dynamic voltage and frequency scaling (DVFS) technique that uses measurements of variations in the rate of change of a computer's video output to estimate user-perceived performance. Our adaptive algorithms, one conservative and one aggressive, use these estimates to dramatically reduce operating frequencies and voltages for graphically-intensive applications while maintaining performance at a satisfactory level for the user. We evaluate PICSEL through user studies conducted on a Pentium M laptop running Windows XP. Experiments performed with 20 users executing three applications indicate that the measured laptop power can be reduced by up to 12.1\%, averaged across all of our users and applications, compared to the default Windows XP DVFS policy. User studies revealed that the difference in overall user satisfaction between the more aggressive version of PICSEL and Windows DVFS were statistically insignificant, whereas the conservative version of PICSEL actually improved user satisfaction when compared to Windows DVFS.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dynamic voltage and frequency scaling; power management; thermal emergency; user-perceived performance", } @Article{Joao:2008:IPO, author = "Jose A. Joao and Onur Mutlu and Hyesoon Kim and Rishi Agarwal and Yale N.
Patt", title = "Improving the performance of object-oriented languages with dynamic predication of indirect jumps", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "80--90", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353535.1346293", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Indirect jump instructions are used to implement increasingly-common programming constructs such as virtual function calls, switch-case statements, jump tables, and interface calls. The performance impact of indirect jumps is likely to increase because indirect jumps with multiple targets are difficult to predict even with specialized hardware.\par This paper proposes a new way of handling hard-to-predict indirect jumps: dynamically predicating them. The compiler (static or dynamic) identifies indirect jumps that are suitable for predication along with their control-flow merge (CFM) points. The hardware predicates the instructions between different targets of the jump and its CFM point if the jump turns out to be hard-to-predict at run time. If the jump would actually have been mispredicted, its dynamic predication eliminates a pipeline flush, thereby improving performance.\par Our evaluations show that Dynamic Indirect jump Predication (DIP) improves the performance of a set of object-oriented applications including the Java DaCapo benchmark suite by 37.8\% compared to a commonly-used branch target buffer based predictor, while also reducing energy consumption by 24.8\%. 
We compare DIP to three previously proposed indirect jump predictors and find that it provides the best performance and energy-efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dynamic predication; indirect jumps; object-oriented languages; predicated execution; virtual functions", } @Article{Wegiel:2008:MCV, author = "Michal Wegiel and Chandra Krintz", title = "The mapping collector: virtual memory support for generational, parallel, and concurrent compaction", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "91--102", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353535.1346294", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Parallel and concurrent garbage collectors are increasingly employed by managed runtime environments (MREs) to maintain scalability, as multi-core architectures and multi-threaded applications become pervasive. Moreover, state-of-the-art MREs commonly implement compaction to eliminate heap fragmentation and enable fast linear object allocation.\par Our empirical analysis of object demographics reveals that unreachable objects in the heap tend to form clusters large enough to be effectively managed at the granularity of virtual memory pages. Even though processes can manipulate the mapping of the virtual address space through the standard operating system (OS) interface on most platforms, extant parallel/concurrent compactors do not do so to exploit this clustering behavior and instead achieve compaction by performing, relatively expensive, object moving and pointer adjustment.\par We introduce the Mapping Collector (MC), which leverages virtual memory operations to reclaim and consolidate free space without moving objects and updating pointers. 
MC is a nearly-single-phase compactor that is simpler and more efficient than previously reported compactors that comprise two to four phases. Through effective MRE-OS coordination, MC maintains the simplicity of a non-moving collector while providing efficient parallel and concurrent compaction.\par We implement both stop-the-world and concurrent MC in a generational garbage collection framework within the open-source HotSpot Java Virtual Machine. Our experimental evaluation using a multiprocessor indicates that MC significantly increases throughput and scalability as well as reduces pause times, relative to state-of-the-art, parallel and concurrent compactors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "compaction; concurrent; parallel; virtual memory", } @Article{Devietti:2008:HAS, author = "Joe Devietti and Colin Blundell and Milo M. K. Martin and Steve Zdancewic", title = "{Hardbound}: architectural support for spatial safety of the {C} programming language", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "103--114", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346295", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The C programming language is at least as well known for its absence of spatial memory safety guarantees (i.e., lack of bounds checking) as it is for its high performance. C's unchecked pointer arithmetic and array indexing allow simple programming mistakes to lead to erroneous executions, silent data corruption, and security vulnerabilities. Many prior proposals have tackled enforcing spatial safety in C programs by checking pointer and array accesses. 
However, existing software-only proposals have significant drawbacks that may prevent wide adoption, including: unacceptably high run-time overheads, lack of completeness, incompatible pointer representations, or need for non-trivial changes to existing C source code and compiler infrastructure.\par Inspired by the promise of these software-only approaches, this paper proposes a hardware bounded pointer architectural primitive that supports cooperative hardware/software enforcement of spatial memory safety for C programs. This bounded pointer is a new hardware primitive datatype for pointers that leaves the standard C pointer representation intact, but augments it with bounds information maintained separately and invisibly by the hardware. The bounds are initialized by the software, and they are then propagated and enforced transparently by the hardware, which automatically checks a pointer's bounds before it is dereferenced. One mode of use requires instrumenting only malloc, which enables enforcement of per-allocation spatial safety for heap-allocated objects for existing binaries. When combined with simple intraprocedural compiler instrumentation, hardware bounded pointers enable a low-overhead approach for enforcing complete spatial memory safety in unmodified C programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "C programming language; spatial memory safety", } @Article{Lvin:2008:ATA, author = "Vitaliy B. Lvin and Gene Novark and Emery D. Berger and Benjamin G. 
Zorn", title = "{Archipelago}: trading address space for reliability and security", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "115--124", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353535.1346296", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory errors are a notorious source of security vulnerabilities that can lead to service interruptions, information leakage and unauthorized access. Because such errors are also difficult to debug, the absence of timely patches can leave users vulnerable to attack for long periods of time. A variety of approaches have been introduced to combat these errors, but these often incur large runtime overheads and generally abort on errors, threatening availability.\par This paper presents Archipelago, a runtime system that takes advantage of available address space to substantially reduce the likelihood that a memory error will affect program execution. Archipelago randomly allocates heap objects far apart in virtual address space, effectively isolating each object from buffer overflows. Archipelago also protects against dangling pointer errors by preserving the contents of freed objects after they are freed. Archipelago thus trades virtual address space---a plentiful resource on 64-bit systems---for significantly improved program reliability and security, while limiting physical memory consumption by tracking the working set of an application and compacting cold objects. We show that Archipelago allows applications to continue to run correctly in the face of thousands of memory errors. 
Across a suite of server applications, Archipelago's performance overhead is 6\% on average (between -7\% and 22\%), making it especially suitable to protect servers that have known security vulnerabilities due to heap memory errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "Archipelago; buffer overflow; dynamic memory allocation; memory errors; probabilistic memory safety; randomized algorithms; virtual memory", } @Article{Choi:2008:ABP, author = "Bumyong Choi and Leo Porter and Dean M. Tullsen", title = "Accurate branch prediction for short threads", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "125--134", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346298", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Multi-core processors, with low communication costs and high availability of execution cores, will increase the use of execution and compilation models that use short threads to expose parallelism. Current branch predictors seek to incorporate large amounts of control flow history to maximize accuracy. However, when that history is absent the predictor fails to work as intended. Thus, modern predictors are almost useless for threads below a certain length.\par Using a Speculative Multithreaded (SpMT) architecture as an example of a system which generates shorter threads, this work examines techniques to improve branch prediction accuracy when a new thread begins to execute on a different core. This paper proposes a minor change to the branch predictor that gives virtually the same performance on short threads as an idealized predictor that incorporates unknowable pre-history of a spawned speculative thread. 
At the same time, strong performance on long threads is preserved. The proposed technique sets the global history register of the spawned thread to the initial value of the program counter. This novel and simple design reduces branch mispredicts by 29\% and provides as much as a 13\% IPC improvement on selected SPEC2000 benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "branch prediction; chip multiprocessors", } @Article{Srikantaiah:2008:ASP, author = "Shekhar Srikantaiah and Mahmut Kandemir and Mary Jane Irwin", title = "Adaptive set pinning: managing shared caches in chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "135--144", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346299", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As part of the trend towards Chip Multiprocessors (CMPs) for the next leap in computing performance, many architectures have explored sharing the last level of cache among different processors for better performance-cost ratio and improved resource allocation. Shared cache management is a crucial CMP design aspect for the performance of the system. This paper first presents a new classification of cache misses - CII: Compulsory, Inter-processor and Intra-processor misses - for CMPs with shared caches to provide a better understanding of the interactions between memory transactions of different processors at the level of shared cache in a CMP. We then propose a novel approach, called set pinning, for eliminating inter-processor misses and reducing intra-processor misses in a shared cache. 
Furthermore, we show that an adaptive set pinning scheme improves over the benefits obtained by the set pinning scheme by significantly reducing the number of off-chip accesses. Extensive analysis of these approaches with SPEComp 2001 benchmarks is performed using a full system simulator. Our experiments indicate that the set pinning scheme achieves an average improvement of 22.18\% in the L2 miss rate while the adaptive set pinning scheme reduces the miss rates by an average of 47.94\% as compared to the traditional shared cache scheme. They also improve the performance by 7.24\% and 17.88\% respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "CMP; inter-processor; intra-processor; set pinning; shared cache", } @Article{Tuck:2008:SSE, author = "James Tuck and Wonsun Ahn and Luis Ceze and Josep Torrellas", title = "{SoftSig}: software-exposed hardware signatures for code analysis and optimization", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "145--156", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346300", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many code analysis techniques for optimization, debugging, or parallelization need to perform runtime disambiguation of sets of addresses. Such operations can be supported efficiently and with low complexity with hardware signatures.\par To enable flexible use of signatures, this paper proposes to expose a Signature Register File to the software through a rich ISA. The software has great flexibility to decide, for each signature, which addresses to collect and which addresses to disambiguate against. We call this architecture SoftSig.
In addition, as an example of SoftSig use, we show how to detect redundant function calls efficiently and eliminate them dynamically. We call this algorithm MemoiSE. On average for five popular applications, MemoiSE reduces the number of dynamic instructions by 9.3\%, thereby reducing the execution time of the applications by 9\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "memory disambiguation; multi-core architectures; runtime optimization", } @Article{Burcea:2008:PV, author = "Ioana Burcea and Stephen Somogyi and Andreas Moshovos and Babak Falsafi", title = "Predictor virtualization", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "157--167", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346301", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many hardware optimizations rely on collecting information about program behavior at runtime. This information is stored in lookup tables. To be accurate and effective, these optimizations usually require large dedicated on-chip tables. Although technology advances offer an increased amount of on-chip resources, these resources are allocated to increase the size of on-chip conventional cache hierarchies.\par This work proposes Predictor Virtualization, a technique that uses the existing memory hierarchy to emulate large predictor tables. We demonstrate the benefits of this technique by virtualizing a state-of-the-art data prefetcher. 
Full-system, cycle-accurate simulations demonstrate that the virtualized prefetcher preserves the performance benefits of the original design, while reducing the on-chip storage dedicated to the predictor table from 60KB down to less than one kilobyte.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "caches; memory hierarchy; metadata; predictor virtualization", } @Article{Ganapathy:2008:DIM, author = "Vinod Ganapathy and Matthew J. Renzelmann and Arini Balakrishnan and Michael M. Swift and Somesh Jha", title = "The design and implementation of microdrivers", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "168--178", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346303", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Device drivers commonly execute in the kernel to achieve high performance and easy access to kernel services. However, this comes at the price of decreased reliability and increased programming difficulty. Driver programmers are unable to use user-mode development tools and must instead use cumbersome kernel tools. Faults in kernel drivers can cause the entire operating system to crash. User-mode drivers have long been seen as a solution to this problem, but suffer from either poor performance or new interfaces that require a rewrite of existing drivers.\par This paper introduces the Microdrivers architecture that achieves high performance and compatibility by leaving critical path code in the kernel and moving the rest of the driver code to a user-mode process. This allows data-handling operations critical to I/O performance to run at full speed, while management operations such as initialization and configuration run at reduced speed in user-level. 
To achieve compatibility, we present DriverSlicer, a tool that splits existing kernel drivers into a kernel-level component and a user-level component using a small number of programmer annotations. Experiments show that as much as 65\% of driver code can be removed from the kernel without affecting common-case performance, and that only 1-6 percent of the code requires annotations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "device drivers; program partitioning; reliability", } @Article{Weinsberg:2008:TFC, author = "Yaron Weinsberg and Danny Dolev and Tal Anker and Muli Ben-Yehuda and Pete Wyckoff", title = "Tapping into the fountain of {CPUs}: on operating system support for programmable devices", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "179--188", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346304", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The constant race for faster and more powerful CPUs is drawing to a close. No longer is it feasible to significantly increase the speed of the CPU without paying a crushing penalty in power consumption and production costs. Instead of increasing single thread performance, the industry is turning to multiple CPU threads or cores (such as SMT and CMP) and heterogeneous CPU architectures (such as the Cell Broadband Engine). While this is a step in the right direction, in every modern PC there is a wealth of untapped compute resources. The NIC has a CPU; the disk controller is programmable; some high-end graphics adapters are already more powerful than host CPUs. Some of these CPUs can perform some functions more efficiently than the host CPUs. 
Our operating systems and programming abstractions should be expanded to let applications tap into these computational resources and make the best use of them.\par Therefore, we propose the HYDRA framework, which lets application developers use the combined power of every compute resource in a coherent way. HYDRA is a programming model and a runtime support layer which enables utilization of host processors as well as various programmable peripheral devices' processors. We present the framework and its application for a demonstrative use-case, as well as provide a thorough evaluation of its capabilities. Using HYDRA we were able to cut down the development cost of a system that uses multiple heterogeneous compute resources significantly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "offloading; operating systems; programming model", } @Article{Shen:2008:HCD, author = "Kai Shen and Ming Zhong and Sandhya Dwarkadas and Chuanpeng Li and Christopher Stewart and Xiao Zhang", title = "Hardware counter driven on-the-fly request signatures", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "189--200", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346306", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Today's processors provide a rich source of statistical information on application execution through hardware counters. In this paper, we explore the utilization of these statistics as request signatures in server applications for identifying requests and inferring high-level request properties (e.g., CPU and I/O resource needs).
Our key finding is that effective request signatures may be constructed using a small amount of hardware statistics while the request is still in an early stage of its execution. Such on-the-fly request identification and property inference allow guided operating system adaptation at request granularity (e.g., resource-aware request scheduling and on-the-fly request classification). We address the challenges of selecting hardware counter metrics for signature construction and providing necessary operating system support for per-request statistics management. Our implementation in the Linux 2.6.10 kernel suggests that our approach requires low overhead suitable for runtime deployment. Our on-the-fly request resource consumption inference (averaging 7\%, 3\%, 20\%, and 41\% prediction errors for four server workloads, TPC-C, TPC-H, J2EE-based RUBiS, and a trace-driven index search, respectively) is much more accurate than the online running-average based prediction (73-82\% errors). Its use for resource-aware request scheduling results in a 15-70\% response time reduction for three CPU-bound applications.
Its use for on-the-fly request classification and anomaly detection exhibits high accuracy for the TPC-H workload with synthetically generated anomalous requests following a typical SQL-injection attack pattern.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "anomaly detection; hardware counter; operating system adaptation; request classification; server system", } @Article{VanErtvelde:2008:DPA, author = "Luk {Van Ertvelde} and Lieven Eeckhout", title = "Dispersing proprietary applications as benchmarks through code mutation", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "201--210", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346307", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Industry vendors hesitate to disseminate proprietary applications to academia and third party vendors. By consequence, the benchmarking process is typically driven by standardized, open-source benchmarks which may be very different from and likely not representative of the real-life applications of interest.\par This paper proposes code mutation, a novel technique that mutates a proprietary application to complicate reverse engineering so that it can be distributed as a benchmark. The benchmark mutant then serves as a proxy for the proprietary application. The key idea in the proposed code mutation approach is to preserve the proprietary application's dynamic memory access and/or control flow behavior in the benchmark mutant while mutating the rest of the application code. 
To this end, we compute program slices for memory access operations and/or control flow operations trimmed through constant value and branch profiles; and subsequently mutate the instructions not appearing in these slices through binary rewriting.\par Our experimental results using SPEC CPU2000 and MiBench benchmarks show that code mutation is a promising technique that mutates up to 90\% of the static binary, up to 50\% of the dynamically executed instructions, and up to 35\% of the at run time exposed inter-operation data dependencies. The performance characteristics of the mutant are very similar to those of the proprietary application across a wide range of microarchitectures and hardware implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "benchmark generation; code mutation", } @Article{Mysore:2008:UVF, author = "Shashidhar Mysore and Bita Mazloom and Banit Agrawal and Timothy Sherwood", title = "Understanding and visualizing full systems with data flow tomography", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "211--221", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346308", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "It is not uncommon for modern systems to be composed of a variety of interacting services, running across multiple machines in such a way that most developers do not really understand the whole system. As abstraction is layered atop abstraction, developers gain the ability to compose systems of extraordinary complexity with relative ease. However, many software properties, especially those that cut across abstraction layers, become very difficult to understand in such compositions. 
The communication patterns involved, the privacy of critical data, and the provenance of information, can be difficult to find and understand, even with access to all of the source code. The goal of Data Flow Tomography is to use the inherent information flow of such systems to help visualize the interactions between complex and interwoven components across multiple layers of abstraction. In the same way that the injection of short-lived radioactive isotopes help doctors trace problems in the cardiovascular system, the use of 'data tagging' can help developers slice through the extraneous layers of software and pin-point those portions of the system interacting with the data of interest. To demonstrate the feasibility of this approach we have developed a prototype system in which tags are tracked both through the machine and in between machines over the network, and from which novel visualizations of the whole system can be derived. We describe the system-level challenges in creating a working system tomography tool and we qualitatively evaluate our system by examining several example real world scenarios.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "data flow tracking; tomography; virtual machine", } @Article{Ottoni:2008:COG, author = "Guilherme Ottoni and David I. August", title = "Communication optimizations for global multi-threaded instruction scheduling", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "222--232", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353535.1346310", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The recent shift in the industry towards chip multiprocessor (CMP) designs has brought the need for multi-threaded applications to mainstream computing. 
As observed in several limit studies, most of the parallelization opportunities require looking for parallelism beyond local regions of code. To exploit these opportunities, especially for sequential applications, researchers have recently proposed global multi-threaded instruction scheduling techniques, including DSWP and GREMIO. These techniques simultaneously schedule instructions from large regions of code, such as arbitrary loop nests or whole procedures, and have been shown to be effective at extracting threads for many applications. A key enabler of these global instruction scheduling techniques is the Multi-Threaded Code Generation (MTCG) algorithm proposed in [16], which generates multi-threaded code for any partition of the instructions into threads. This algorithm inserts communication and synchronization instructions in order to satisfy all inter-thread dependences.\par In this paper, we present a general compiler framework, COCO, to optimize the communication and synchronization instructions inserted by the MTCG algorithm. This framework, based on thread-aware data-flow analyses and graph min-cut algorithms, appropriately models and optimizes all kinds of inter-thread dependences, including register, memory, and control dependences. Our experiments, using a fully automatic compiler implementation of these techniques, demonstrate significant reductions (about 30\% on average) in the number of dynamic communication instructions in code parallelized with DSWP and GREMIO. This reduction in communication translates to performance gains of up to 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "communication; data-flow analysis; graph min-cut; instruction scheduling; multi-threading; synchronization", } @Article{Kulkarni:2008:OPB, author = "Milind Kulkarni and Keshav Pingali and Ganesh Ramanarayanan and Bruce Walter and Kavita Bala and L. 
Paul Chew", title = "Optimistic parallelism benefits from data partitioning", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "233--243", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353534.1346311", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent studies of irregular applications such as finite-element mesh generators and data-clustering codes have shown that these applications have a generalized data parallelism arising from the use of iterative algorithms that perform computations on elements of worklists. In some irregular applications, the computations on different elements are independent. In other applications, there may be complex patterns of dependences between these computations.\par The Galois system was designed to exploit this kind of irregular data parallelism on multicore processors. Its main features are (i) two kinds of set iterators for expressing worklist-based data parallelism, and (ii) a runtime system that performs optimistic parallelization of these iterators, detecting conflicts and rolling back computations as needed. Detection of conflicts and rolling back iterations requires information from class implementors.\par In this paper, we introduce mechanisms to improve the execution efficiency of Galois programs: data partitioning, data-centric work assignment, lock coarsening, and over-decomposition. These mechanisms can be used to exploit locality of reference, reduce mis-speculation, and lower synchronization overhead. We also argue that the design of the Galois system permits these mechanisms to be used with relatively little modification to the user code. 
Finally, we present experimental results that demonstrate the utility of these mechanisms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "data partitioning; irregular programs; locality; lock coarsening; optimistic parallelism; over-decomposition", } @Article{Cox:2008:XEO, author = "Russ Cox and Tom Bergan and Austin T. Clements and Frans Kaashoek and Eddie Kohler", title = "{Xoc}, an extension-oriented compiler for systems programming", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "244--254", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353535.1346312", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Today's system programmers go to great lengths to extend the languages in which they program. For instance, system-specific compilers find errors in Linux and other systems, and add support for specialized control flow to Qt and event-based programs. These compilers are difficult to build and cannot always understand each other's language changes. However, they can greatly improve code understandability and correctness, advantages that should be accessible to all programmers.\par We describe an extension-oriented compiler for C called xoc. An extension-oriented compiler, unlike a conventional extensible compiler, implements new features via many small extensions that are loaded together as needed. Xoc gives extension writers full control over program syntax and semantics while hiding many compiler internals. Xoc programmers concisely define powerful compiler extensions that, by construction, can be combined; even some parts of the base compiler, such as GNU C compatibility, are structured as extensions.\par Xoc is based on two key interfaces. 
Syntax patterns allow extension writers to manipulate language fragments using concrete syntax. Lazy computation of attributes allows extension writers to use the results of analyses by other extensions or the core without needing to worry about pass scheduling.\par Extensions built using xoc include xsparse, a 345-line extension that mimics Sparse, Linux's C front end, and xlambda, a 170-line extension that adds function expressions to C. An evaluation of xoc using these and 13 other extensions shows that xoc extensions are typically more concise than equivalent extensions written for conventional extensible compilers and that it is possible to compose extensions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "extension-oriented compilers", } @Article{Wells:2008:AIF, author = "Philip M. Wells and Koushik Chakraborty and Gurindar S. Sohi", title = "Adapting to intermittent faults in multicore systems", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "255--264", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353536.1346314", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Future multicore processors will be more susceptible to a variety of hardware failures. In particular, intermittent faults, caused in part by manufacturing, thermal, and voltage variations, can cause bursts of frequent faults that last from several cycles to several seconds or more. 
Due to practical limitations of circuit techniques, cost-effective reliability will likely require the ability to temporarily suspend execution on a core during periods of intermittent faults.\par We investigate three of the most obvious techniques for adapting to the dynamically changing resource availability caused by intermittent faults, and demonstrate their different system-level implications. We show that system software reconfiguration has very high overhead, that temporarily pausing execution on a faulty core can lead to cascading livelock, and that using spare cores has high fault-free cost. To remedy these and other drawbacks of the three baseline techniques, we propose using a thin hardware/firmware layer to manage an overcommitted system -- one where the OS is configured to use more virtual processors than the number of currently available physical cores. We show that this proposed technique can gracefully degrade performance during intermittent faults of various duration with low overhead, without involving system software, and without requiring spare cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "intermittent faults; overcommitted system", } @Article{Li:2008:UPH, author = "Man-Lap Li and Pradeep Ramachandran and Swarup Kumar Sahoo and Sarita V. Adve and Vikram S. 
Adve and Yuanyuan Zhou", title = "Understanding the propagation of hard errors to software and implications for resilient system design", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "265--276", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346315", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With continued CMOS scaling, future shipped hardware will be increasingly vulnerable to in-the-field faults. To be broadly deployable, the hardware reliability solution must incur low overheads, precluding use of expensive redundancy. We explore a cooperative hardware-software solution that watches for anomalous software behavior to indicate the presence of hardware faults. Fundamental to such a solution is a characterization of how hardware faults in different microarchitectural structures of a modern processor propagate through the application and OS.\par This paper aims to provide such a characterization, resulting in identifying low-cost detection methods and providing guidelines for implementation of the recovery and diagnosis components of such a reliability solution. We focus on hard faults because they are increasingly important and have different system implications than the much studied transients. We achieve our goals through fault injection experiments with a microarchitecture-level full system timing simulator.
Our main results are: (1) we are able to detect 95\% of the unmasked faults in 7 out of 8 studied microarchitectural structures with simple detectors that incur zero to little hardware overhead; (2) over 86\% of these detections are within latencies that existing hardware checkpointing schemes can handle, while others require software checkpointing; and (3) a surprisingly large fraction of the detected faults corrupt OS state, but almost all of these are detected with latencies short enough to use hardware checkpointing, thereby enabling OS recovery in virtually all such cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "architecture; error detection; fault injection; permanent fault", } @Article{Suleman:2008:FDT, author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N. Patt", title = "Feedback-driven threading: power-efficient and high-performance execution of multi-threaded workloads on {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "277--286", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346317", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Extracting high-performance from the emerging Chip Multiprocessors (CMPs) requires that the application be divided into multiple threads. Each thread executes on a separate core thereby increasing concurrency and improving performance. As the number of cores on a CMP continues to increase, the performance of some multi-threaded applications will benefit from the increased number of threads, whereas, the performance of other multi-threaded applications will become limited by data-synchronization and off-chip bandwidth. 
For applications that get limited by data-synchronization, increasing the number of threads significantly degrades performance and increases on-chip power. Similarly, for applications that get limited by off-chip bandwidth, increasing the number of threads increases on-chip power without providing any performance improvement. Furthermore, whether an application gets limited by data-synchronization, or bandwidth, or neither depends not only on the application but also on the input set and the machine configuration. Therefore, controlling the number of threads based on the run-time behavior of the application can significantly improve performance and reduce power.\par This paper proposes Feedback-Driven Threading (FDT), a framework to dynamically control the number of threads using run-time information. FDT can be used to implement Synchronization-Aware Threading (SAT), which predicts the optimal number of threads depending on the amount of data-synchronization. Our evaluation shows that SAT can reduce both execution time and power by up to 66\% and 78\% respectively. Similarly, FDT can be used to implement Bandwidth-Aware Threading (BAT), which predicts the minimum number of threads required to saturate the off-chip bus. Our evaluation shows that BAT reduces on-chip power by up to 78\%. When SAT and BAT are combined, the average execution time reduces by 17\% and power reduces by 59\%. The proposed techniques leverage existing performance counters and require minimal support from the threading library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bandwidth; CMP; multi-threaded; synchronization", } @Article{Linderman:2008:MPM, author = "Michael D. Linderman and Jamison D. Collins and Hong Wang and Teresa H. 
Meng", title = "{Merge}: a programming model for heterogeneous multi-core systems", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "287--296", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346318", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper we propose the Merge framework, a general purpose programming model for heterogeneous multi-core systems. The Merge framework replaces current ad hoc approaches to parallel programming on heterogeneous platforms with a rigorous, library-based methodology that can automatically distribute computation across heterogeneous cores to achieve increased energy and performance efficiency. The Merge framework provides (1) a predicate dispatch-based library system for managing and invoking function variants for multiple architectures; (2) a high-level, library-oriented parallel language based on map-reduce; and (3) a compiler and runtime which implement the map-reduce language pattern by dynamically selecting the best available function implementations for a given input and machine configuration. Using a generic sequencer architecture interface for heterogeneous accelerators, the Merge framework can integrate function variants for specialized accelerators, offering the potential for to-the-metal performance for a wide range of heterogeneous architectures, all transparent to the user. The Merge framework has been prototyped on a heterogeneous platform consisting of an Intel Core 2 Duo CPU and an 8-core 32-thread Intel Graphics and Media Accelerator X3000, and a homogeneous 32-way Unisys SMP system with Intel Xeon processors. 
We implemented a set of benchmarks using the Merge framework and enhanced the library with X3000 specific implementations, achieving speedups of 3.6x -- 8.5x using the X3000 and 5.2x -- 22x using the 32-way system relative to the straight C reference implementation on a single IA32 core.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "GPGPU; heterogeneous multi-core; predicate dispatch", } @Article{Gummaraju:2008:SPG, author = "Jayanth Gummaraju and Joel Coburn and Yoshio Turner and Mendel Rosenblum", title = "{Streamware}: programming general-purpose multicore processors using streams", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "297--307", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346319", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recently, the number of cores on general-purpose processors has been increasing rapidly. Using conventional programming models, it is challenging to effectively exploit these cores for maximal performance. An interesting alternative candidate for programming multiple cores is the stream programming model, which provides a framework for writing programs in a sequential-style while greatly simplifying the task of automatic parallelization. It has been shown that not only traditional media/image applications but also more general-purpose data-intensive applications can be expressed in the stream programming style.\par In this paper, we investigate the potential to use the stream programming model to efficiently utilize commodity multicore general-purpose processors (e.g., Intel/AMD). Although several stream languages and stream compilers have recently been developed, they typically target special-purpose stream processors. 
In contrast, we propose a flexible software system, Streamware, which automatically maps stream programs onto a wide variety of general-purpose multicore processor configurations. We leverage existing compilation framework for stream processors and design a runtime environment which takes as input the output of these stream compilers in the form of machine-independent stream virtual machine code. The runtime environment assigns work to processor cores considering processor/cache configurations and adapts to workload variations. We evaluate this approach for a few general-purpose scientific applications on real hardware and a cycle-level simulator set-up to showcase scaling and contention issues. The results show that the stream programming model is a good choice for efficiently exploiting modern and future multicore CPUs for an important class of applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "general-purpose multicore processors; programming; runtime system; streams", } @Article{Nightingale:2008:PSC, author = "Edmund B. Nightingale and Daniel Peek and Peter M. Chen and Jason Flinn", title = "Parallelizing security checks on commodity hardware", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "308--318", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346321", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Speck (Speculative Parallel Check) is a system that accelerates powerful security checks on commodity hardware by executing them in parallel on multiple cores. Speck provides an infrastructure that allows sequential invocations of a particular security check to run in parallel without sacrificing the safety of the system. Speck creates parallelism in two ways. 
First, Speck decouples a security check from an application by continuing the application, using speculative execution, while the security check executes in parallel on another core. Second, Speck creates parallelism between sequential invocations of a security check by running later checks in parallel with earlier ones. Speck provides a process-level replay system to deterministically and efficiently synchronize state between a security check and the original process. We use Speck to parallelize three security checks: sensitive data analysis, on-access virus scanning, and taint propagation. Running on a 4-core and an 8-core computer, Speck improves performance 4x and 7.5x for the sensitive data analysis check, 3.3x and 2.8x for the on-access virus scanning check, and 1.6x and 2x for the taint propagation check.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "operating systems; parallel; performance; security; speculative execution", } @Article{Castro:2008:BBR, author = "Miguel Castro and Manuel Costa and Jean-Philippe Martin", title = "Better bug reporting with better privacy", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "319--328", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346322", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software vendors collect bug reports from customers to improve the quality of their software. These reports should include the inputs that make the software fail, to enable vendors to reproduce the bug. However, vendors rarely include these inputs in reports because they may contain private user data. 
We describe a solution to this problem that provides software vendors with new input values that satisfy the conditions required to make the software follow the same execution path until it fails, but are otherwise unrelated with the original inputs. These new inputs allow vendors to reproduce the bug while revealing less private information than existing approaches. Additionally, we provide a mechanism to measure the amount of information revealed in an error report. This mechanism allows users to perform informed decisions on whether or not to submit reports. We implemented a prototype of our solution and evaluated it with real errors in real programs. The results show that we can produce error reports that allow software vendors to reproduce bugs while revealing almost no private information.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bug reports; constraint solving; privacy; symbolic execution", } @Article{Lu:2008:LMC, author = "Shan Lu and Soyeon Park and Eunsoo Seo and Yuanyuan Zhou", title = "Learning from mistakes: a comprehensive study on real world concurrency bug characteristics", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "329--339", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353536.1346323", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The reality of multi-core hardware has made concurrent programs pervasive. Unfortunately, writing correct concurrent programs is difficult. Addressing this challenge requires advances in multiple directions, including concurrency bug detection, concurrent program testing, concurrent programming model design, etc. 
Designing effective techniques in all these directions will significantly benefit from a deep understanding of real world concurrency bug characteristics.\par This paper provides the first (to the best of our knowledge) comprehensive real world concurrency bug characteristic study. Specifically, we have carefully examined concurrency bug patterns, manifestation, and fix strategies of 105 randomly selected real world concurrency bugs from 4 representative server and client open-source applications (MySQL, Apache, Mozilla and OpenOffice). Our study reveals several interesting findings and provides useful guidance for concurrency bug detection, testing, and concurrent programming language design.\par Some of our findings are as follows: (1) Around one third of the examined non-deadlock concurrency bugs are caused by violation to programmers' order intentions, which may not be easily expressed via synchronization primitives like locks and transactional memories; (2) Around 34\% of the examined non-deadlock concurrency bugs involve multiple variables, which are not well addressed by existing bug detection tools; (3) About 92\% of the examined concurrency bugs can be reliably triggered by enforcing certain orders among no more than 4 memory accesses. 
This indicates that testing concurrent programs can target at exploring possible orders among every small groups of memory accesses, instead of among all memory accesses; (4) About 73\% of the examined non-deadlock concurrency bugs were not fixed by simply adding or changing locks, and many of the fixes were not correct at the first try, indicating the difficulty of reasoning concurrent execution by programmers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bug characteristics; concurrency bug; concurrent program", } @Article{Anonymous:2008:MGC, author = "Anonymous", title = "Message from the {General Chairs}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "x--x", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382166", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2008:MPC, author = "Anonymous", title = "Message from the {Program Chair}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "xi--xi", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382167", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2008:R, author = "Anonymous", title = "Reviewers", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "xv--xviii", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382168", 
ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tseng:2008:AOP, author = "Francis Tseng and Yale N. Patt", title = "Achieving Out-of-Order Performance with Almost In-Order Complexity", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "3--12", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382169", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "There is still much performance to be gained by out-of-order processors with wider issue widths. However, traditional methods of increasing issue width do not scale; that is, they drastically increase design complexity and power requirements. This paper introduces the braid, a compile-time identified entity that enables the execution core to scale to wider widths by exploiting the small fanout and short lifetime of values produced by the program. Braid processing requires identification by the compiler, minor extensions to the ISA, and support by the microarchitecture. The result from processing braids is performance within 9\% of a very aggressive conventional out-of-order microarchitecture with almost the complexity of an in-order implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agarwal:2008:FCR, author = "Mayank Agarwal and Nitin Navale and Kshitiz Malik and Matthew I. 
Frank", title = "Fetch-Criticality Reduction through Control Independence", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "13--24", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.39", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Architectures that exploit control independence (CI) promise to remove in-order fetch bottlenecks, like branch mispredicts, instruction-cache misses and fetch unit stalls, from the critical path of single-threaded execution. By exposing more fetch options, however, CI architectures also expose more performance tradeoffs. These tradeoffs make it hard to design policies that deliver good performance. This paper presents a criticality-based model for reasoning about CI architectures, and uses that model to describe the tradeoffs between gains from control independence versus increased costs of honoring data dependences. The model is then used to derive the design of a criticality-aware task selection policy that strikes the right balance between fetch-criticality and execute-criticality. Finally, the paper validates the model by attacking branch-misprediction induced fetch-criticality through the above derived spawn policy. This leads to as high as 100\% improvements in performance, and in the region of 40\% or more improvements for four of the benchmarks where this is the main problem. Criticality analysis shows that this improvement arises due to reduced fetch-criticality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "control independence; fetch-criticality; implicit parallelization", } @Article{Pericas:2008:TLL, author = "Miquel Peric{\`a}s and Adrian Cristal and Francisco J. Cazorla and Ruben Gonz{\'a}lez and Alex Veidenbaum and Daniel A. 
Jim{\'e}nez and Mateo Valero", title = "A Two-Level Load\slash Store Queue Based on Execution Locality", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "25--36", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.10", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Multicore processors have emerged as a powerful platform on which to efficiently exploit thread-level parallelism (TLP). However, due to Amdahl's Law, such designs will be increasingly limited by the remaining sequential components of applications. To overcome this limitation it is necessary to design processors with many lower-performance cores for TLP and some high-performance cores designed to execute sequential algorithms. Such cores will need to address the memory-wall by implementing kilo-instruction windows. Large window processors require large Load/Store Queues that would be too slow if implemented using current CAM-based designs. This paper proposes an Epoch-based Load Store Queue (ELSQ), a new design based on Execution Locality. It is integrated into a large-window processor that has a fast, out-of-order core operating only on L1/L2 cache hits and N slower cores that process L2 misses and their dependent instructions. The large LSQ is coupled with the slow cores and is partitioned into N small and local LSQs, one per core. We evaluate ELSQ in a large-window environment, finding that it enables high performance at low power. 
By exploiting locality among loads and stores, ELSQ outperforms even an idealized central LSQ when implemented on top of a decoupled processor design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "execution locality; kilo-instruction processors; load/store queue; power-efficiency", } @Article{Ipek:2008:SOM, author = "Engin Ipek and Onur Mutlu and Jos{\'e} F. Mart{\'\i}nez and Rich Caruana", title = "Self-Optimizing Memory Controllers: a Reinforcement Learning Approach", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "39--50", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382172", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Efficiently utilizing off-chip DRAM bandwidth is a critical issue in designing cost-effective, high-performance chip multiprocessors (CMPs). Conventional memory controllers deliver relatively low performance in part because they often employ fixed, rigid access scheduling policies designed for average-case application behavior. As a result, they cannot learn and optimize the long-term performance impact of their scheduling decisions, and cannot adapt their scheduling policies to dynamic workload behavior. We propose a new, self-optimizing memory controller design that operates using the principles of reinforcement learning (RL) to overcome these limitations. Our RL-based memory controller observes the system state and estimates the long-term performance impact of each action it can take. In this way, the controller learns to optimize its scheduling policy on the fly to maximize long-term performance.
Our results show that an RL-based memory controller improves the performance of a set of parallel applications run on a 4-core CMP by 19\% on average (up to 33\%), and it improves DRAM bandwidth utilization by 22\% compared to a state-of-the-art controller.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; machine learning; memory controller; memory systems; reinforcement learning", } @Article{Thoziyoor:2008:CMM, author = "Shyamkumar Thoziyoor and Jung Ho Ahn and Matteo Monchiero and Jay B. Brockman and Norman P. Jouppi", title = "A Comprehensive Memory Modeling Tool and Its Application to the Design and Analysis of Future Memory Hierarchies", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "51--62", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.16", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper we introduce CACTI-D, a significant enhancement of CACTI 5.0. CACTI-D adds support for modeling of commodity DRAM technology and support for main memory DRAM chip organization. CACTI-D enables modeling of the complete memory hierarchy with consistent models all the way from SRAM based L1 caches through main memory DRAMs on DIMMs. We illustrate the potential applicability of CACTI-D in the design and analysis of future memory hierarchies by carrying out a last level cache study for a multicore multithreaded architecture at the 32nm technology node. In this study we use CACTI-D to model all components of the memory hierarchy including L1, L2, last level SRAM, logic process based DRAM or commodity DRAM L3 caches, and main memory DRAM chips.
We carry out architectural simulation using benchmarks with large data sets and present results of their execution time, breakdown of power in the memory hierarchy, and system energy-delay product for the different system configurations. We find that commodity DRAM technology is most attractive for stacked last level caches, with significantly lower energy-delay products.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache; CACTI; commodity DRAM; LLC; logic-process based DRAM; SRAM", } @Article{Mutlu:2008:PAB, author = "Onur Mutlu and Thomas Moscibroda", title = "Parallelism-Aware Batch Scheduling: Enhancing both Performance and Fairness of Shared {DRAM} Systems", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "63--74", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382128", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In a chip-multiprocessor (CMP) system, the DRAM system is shared among cores. In a shared DRAM system, requests from a thread can not only delay requests from other threads by causing bank/bus/row-buffer conflicts but they can also destroy other threads' DRAM-bank-level parallelism. Requests whose latencies would otherwise have been overlapped could effectively become serialized. As a result both fairness and system throughput degrade, and some threads can starve for long time periods. This paper proposes a fundamentally new approach to designing a shared DRAM controller that provides quality of service to threads, while also improving system throughput. Our parallelism-aware batch scheduler (PAR-BS) design is based on two key ideas. First, PAR-BS processes DRAM requests in batches to provide fairness and to avoid starvation of requests.
Second, to optimize system throughput, PAR-BS employs a parallelism-aware DRAM scheduling policy that aims to process requests from a thread in parallel in the DRAM banks, thereby reducing the memory-related stall-time experienced by the thread. PAR-BS seamlessly incorporates support for system-level thread priorities and can provide different service levels, including purely opportunistic service, to threads with different priorities. We evaluate the design trade-offs involved in PAR-BS and compare it to four previously proposed DRAM scheduler designs on 4-, 8-, and 16-core systems. Our evaluations show that, averaged over 100 4-core workloads, PAR-BS improves fairness by 1.11X and system throughput by 8.3\% compared to the best previous scheduling technique, Stall-Time Fair Memory (STFM) scheduling. Based on simple request prioritization rules, PAR-BS is also simpler to implement than STFM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; DRAM systems; fairness; memory scheduling; memory systems; memory-level parallelism; multi-core systems; quality of service", } @Article{Kim:2008:TDH, author = "John Kim and William J. Dally and Steve Scott and Dennis Abts", title = "Technology-Driven, Highly-Scalable {Dragonfly} Topology", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "77--88", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.19", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Evolving technology and increasing pin-bandwidth motivate the use of high-radix routers to reduce the diameter, latency, and cost of interconnection networks. High-radix networks, however, require longer cables than their low-radix counterparts.
Because cables dominate network cost, the number of cables, and particularly the number of long, global cables should be minimized to realize an efficient network. In this paper, we introduce the dragonfly topology which uses a group of high-radix routers as a virtual router to increase the effective radix of the network. With this organization, each minimally routed packet traverses at most one global channel. By reducing global channels, a dragonfly reduces cost by 20\% compared to a flattened butterfly and by 52\% compared to a folded Clos network in configurations with $ \geq $ 16K nodes. We also introduce two new variants of global adaptive routing that enable load-balanced routing in the dragonfly. Each router in a dragonfly must make an adaptive routing decision based on the state of a global channel connected to a different router. Because of the indirect nature of this routing decision, conventional adaptive routing algorithms give degraded performance. We introduce the use of selective virtual-channel discrimination and the use of credit round-trip latency to both sense and signal channel congestion. The combination of these two methods gives throughput and latency that approaches that of an ideal adaptive routing algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dragonfly; interconnection networks; topology", } @Article{Lee:2008:GSF, author = "Jae W. 
Lee and Man Cheuk Ng and Krste Asanovi{\'c}", title = "Globally-Synchronized Frames for Guaranteed Quality-of-Service in On-Chip Networks", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "89--100", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382130", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Future chip multiprocessors (CMPs) may have hundreds to thousands of threads competing to access shared resources, and will require quality-of-service (QoS) support to improve system utilization. Although there has been significant work in QoS support within resources such as caches and memory controllers, there has been less attention paid to QoS support in the multi-hop on-chip networks that will form an important component in future systems. In this paper we introduce Globally-Synchronized Frames (GSF), a framework for providing guaranteed QoS in on-chip networks in terms of minimum bandwidth and a maximum delay bound. The GSF framework can be easily integrated in a conventional virtual channel (VC) router without significantly increasing the hardware complexity. We rely on a fast barrier network, which is feasible in an on-chip environment, to efficiently implement GSF. Performance guarantees are verified by both analysis and simulation. According to our simulations, all concurrent flows receive their guaranteed minimum share of bandwidth in compliance with a given bandwidth allocation.
The average throughput degradation of GSF on a 8x8 mesh network is within 10\% compared to the conventional best-effort VC router in most cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; interconnects; multicores; on-chip network; quality-of-service; resource management; router; software interface", } @Article{Kim:2008:PCN, author = "Martha Mercaldi Kim and John D. Davis and Mark Oskin and Todd Austin", title = "Polymorphic On-Chip Networks", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "101--112", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.25", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As the number of cores per die increases, be they processors, memory blocks, or custom accelerators, the on-chip interconnect the cores use to communicate gains importance. We begin this study with an area-performance analysis of the interconnect design space. We find that there is no single network design that yields optimal performance across a range of traffic patterns. This indicates that there is an opportunity to gain performance by customizing the interconnect to a particular application or workload. We propose polymorphic on-chip networks to enable per-application network customization. This network can be configured prior to application runtime, to have the topology and buffering of arbitrary network designs. This paper proposes one such polymorphic network architecture. We demonstrate its modes of configurability, and evaluate the polymorphic network architecture design space, producing polymorphic fabrics that minimize the network area overhead. 
Finally, we expand the network on chip design space to include a polymorphic network design, showing that a single polymorphic network is capable of implementing all of the pareto optimal fixed-network designs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "configurable hardware; on-chip network", } @Article{Baugh:2008:UHM, author = "Lee Baugh and Naveen Neelakantam and Craig Zilles", title = "Using Hardware Memory Protection to Build a High-Performance, Strongly-Atomic Hybrid Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "115--126", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382132", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We demonstrate how fine-grained memory protection can be used in support of transactional memory systems: first showing how a software transactional memory system (STM) can be made strongly atomic by using memory protection on transactionally-held state, then showing how such a strongly-atomic STM can be used with a bounded hardware TM system to build a hybrid TM system in which zero-overhead hardware transactions may safely run concurrently with potentially-conflicting software transactions. We experimentally demonstrate how this hybrid TM organization avoids the common-case overheads associated with previous hybrid TM proposals, achieving performance rivaling an unbounded HTM system without the hardware complexity of ensuring completion of arbitrary transactions in hardware. 
As part of our findings, we identify key policies regarding contention management within and across the hardware and software TM components that are key to achieving robust performance with a hybrid TM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "abort handler; hybrid; memory protection; primitives; strong atomicity; transactional memory", } @Article{Bobba:2008:TEE, author = "Jayaram Bobba and Neelam Goyal and Mark D. Hill and Michael M. Swift and David A. Wood", title = "{TokenTM}: Efficient Execution of Large Transactions with Hardware Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "127--138", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382133", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Current hardware transactional memory systems seek to simplify parallel programming, but assume that large transactions are rare, so it is acceptable to penalize their performance or concurrency. However, future programmers may wish to use large transactions more often in order to integrate with higher-level programming models (e.g., database transactions) or perform selected I/O operations. To prevent the 'small transactions are common' assumption from becoming self-fulfilling, this paper contributes TokenTM --- an unbounded HTM that uses the abstraction of tokens to precisely track conflicts on an unbounded number of memory blocks. TokenTM implements tokens with new mechanisms, including metastate fission/fusion and fast token release. 
TokenTM executes small transactions fast, executes concurrent large transactions with no penalty to nonconflicting transactions, and gracefully handles paging, context switching, and System-V-style shared memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "coherence protocols; hardware transactional memory; metastates; tokens; transactional memory; unbounded transactions", } @Article{Shriraman:2008:FDT, author = "Arrvindh Shriraman and Sandhya Dwarkadas and Michael L. Scott", title = "Flexible Decoupled Transactional Memory Support", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "139--150", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.17", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A high-concurrency transactional memory (TM) implementation needs to track concurrent accesses, buffer speculative updates, and manage conflicts. We present a system, FlexTM (FLEXible Transactional Memory), that coordinates four decoupled hardware mechanisms: read and write signatures, which summarize per-thread access sets; per-thread conflict summary tables (CSTs), which identify the threads with which conflicts have occurred; Programmable Data Isolation, which maintains speculative updates in the local cache and employs a thread-private buffer (in virtual memory) in the rare event of overflow; and Alert-On-Update, which selectively notifies threads about coherence events. All mechanisms are software-accessible, to enable virtualization and to support transactions of arbitrary length. FlexTM allows software to determine when to manage conflicts (either eagerly or lazily), and to employ a variety of conflict management and commit protocols. 
We describe an STM-inspired protocol that uses CSTs to manage conflicts in a distributed manner (no global arbitration) and allows parallel commits. In experiments with a prototype on Simics/GEMS, FlexTM exhibits 5x speedup over high-quality software TM, with no loss in policy flexibility. Its distributed commit protocol is also more efficient than a central hardware manager. Our results highlight the importance of flexibility in determining when to manage conflicts: lazy maximizes concurrency and helps to ensure forward progress while eager provides better overall utilization in a multi-programmed system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache coherence; Conflict detection; FlexTM; Hardware; Multiprocessors; RTM; Transactional memory", } @Article{Vantrease:2008:CSI, author = "Dana Vantrease and Robert Schreiber and Matteo Monchiero and Moray McLaren and Norman P. Jouppi and Marco Fiorentino and Al Davis and Nathan Binkert and Raymond G. Beausoleil and Jung Ho Ahn", title = "{Corona}: System Implications of Emerging Nanophotonic Technology", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "153--164", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382135", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We expect that many-core microprocessors will push performance per chip from the 10 gigaflop to the 10 teraflop range in the coming decade. To support this increased performance, memory and inter-core bandwidths will also have to scale by orders of magnitude. Pin limitations, the energy cost of electrical signaling, and the non-scalability of chip-length global wires are significant bandwidth impediments. 
Recent developments in silicon nanophotonic technology have the potential to meet these off- and on-stack bandwidth requirements at acceptable power levels. Corona is a 3D many-core architecture that uses nanophotonic communication for both inter-core communication and off-stack communication to memory or I/O devices. Its peak floating-point performance is 10 teraflops. Dense wavelength division multiplexed optically connected memory modules provide 10 terabyte per second memory bandwidth. A photonic crossbar fully interconnects its 256 low-power multithreaded cores at 20 terabyte per second bandwidth. We have simulated a 1024 thread Corona system running synthetic benchmarks and scaled versions of the SPLASH-2 benchmark suite. We believe that in comparison with an electrically-connected many-core alternative that uses the same on-stack interconnect power, Corona can provide 2 to 6 times more performance on many memory intensive workloads, while simultaneously reducing power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "3D stacking; many-core CMP; nanophotonics; on-chip Networks", } @Article{Kreger-Stickles:2008:MAI, author = "Lucas Kreger-Stickles and Mark Oskin", title = "Microcoded Architectures for Ion-Tap Quantum Computers", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "165--176", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382136", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper we present the first ever systematic design space exploration of microcoded software fault tolerant ion-trap quantum computers. This exploration reveals the critical importance of a well-tuned microcode for providing high performance and ensuring system reliability. 
In addition, we find that, despite recent advances in the reliability of quantum memory, the impact of errors due to stored quantum data is now, and will continue to be, a major source of systemic error. Finally, our exploration reveals a single design which out performs all others we considered in run time, fidelity and area. For completeness our design space exploration includes designs from prior work and we find a novel design that is 1/2 the size, 3 times as fast, and an order of magnitude more reliable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "architecture; ion-trap; microcoded; quantum", } @Article{Isailovic:2008:RQC, author = "Nemanja Isailovic and Mark Whitney and Yatish Patel and John Kubiatowicz", title = "Running a Quantum Circuit at the Speed of Data", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "177--188", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382137", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We analyze circuits for kernels from popular quantum computing applications, characterizing the hardware resources necessary to take ancilla preparation off the critical path. The result is a chip entirely dominated by ancilla generation circuits. To address this issue, we introduce optimized ancilla factories and analyze their structure and physical layout for ion trap technology. We introduce a new quantum computing architecture with highly concentrated data-only regions surrounded by shared ancilla factories. 
The results are a reduced dependence on costly teleportation, more efficient distribution of generated ancillae and more than five times speedup over previous proposals.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "ancilla factory; microarchitecture; quantum", } @Article{Liang:2008:RVT, author = "Xiaoyao Liang and Gu-Yeon Wei and David Brooks", title = "{ReVIVaL}: a Variation-Tolerant Architecture Using Voltage Interpolation and Variable Latency", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "191--202", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382138", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Process variations are poised to significantly degrade performance benefits sought by moving to the next nanoscale technology node. Parameter fluctuations in devices can introduce large variations in peak operation among chips, among cores on a single chip, and among microarchitectural blocks within one core. Hence, it will be difficult to only rely on traditional frequency binning to efficiently cover the large variations that are expected. Furthermore, multiple voltage/frequency domains introduce significant hardware overhead and alone cannot address the full extent of delay variations expected in future multi-core systems. In this paper, we present ReVIVaL, which combines two fine-grained post-fabrication tuning techniques---voltage interpolation (VI) and variable latency (VL). We show that the frequency variation between chips, between cores on one chip, and between functional units within cores can be reduced to a very small range. The effectiveness of these techniques are further verified through experiments on test chips fabricated in a 130nm CMOS process.
Detailed architectural simulations of multi-core processors demonstrate significant performance and power advantages are possible by combining variable latency with voltage interpolation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessor; microarchitecture; process variations", } @Article{Wilkerson:2008:TCC, author = "Chris Wilkerson and Hongliang Gao and Alaa R. Alameldeen and Zeshan Chishti and Muhammad Khellah and Shih-Lien Lu", title = "Trading off Cache Capacity for Reliability to Enable Low Voltage Operation", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "203--214", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382139", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "One of the most effective techniques to reduce a processor's power consumption is to reduce supply voltage. However, reducing voltage in the context of manufacturing-induced parameter variations can cause many types of memory circuits to fail. As a result, voltage scaling is limited by a minimum voltage, often called Vccmin, beyond which circuits may not operate reliably. Large memory structures (e.g., caches) typically set Vccmin for the whole processor. In this paper, we propose two architectural techniques that enable microprocessor caches (L1 and L2), to operate at low voltages despite very high memory cell failure rates. The Word-disable scheme combines two consecutive cache lines, to form a single cache line where only non-failing words are used. The Bit-fix scheme uses a quarter of the ways in a cache set to store positions and fix bits for failing bits in other ways of the set. During high voltage operation, both schemes allow use of the entire cache.
During low voltage operation, they sacrifice cache capacity by 50\% and 25\%, respectively, to reduce Vccmin below 500mV. Compared to current designs with a Vccmin of 825mV, our schemes enable a 40\% voltage reduction, which reduces power by 85\% and energy per instruction (EPI) by 53\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache; cache design; low power; low voltage; reliability; SRAM; stability; Vccmin", } @Article{Roesner:2008:CDP, author = "Franziska Roesner and Doug Burger and Stephen W. Keckler", title = "Counting Dependence Predictors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "215--226", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382140", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern processors rely on memory dependence prediction to execute load instructions as early as possible, speculating that they are not dependent on an earlier, unissued store. To date, the most sophisticated dependence predictors, such as Store Sets, have been tightly coupled to the fetch and execution streams, requiring global knowledge of the in-flight stream of stores to synchronize loads with specific stores. This paper proposes a new dependence predictor design, called a Counting Dependence Predictor (CDP). The key feature of CDPs is that the prediction mechanism predicts some set of events for which a particular dynamic load should wait, which may include some number of matching stores. By waiting for local events only, this dependence predictor can work effectively in a distributed microarchitecture where centralized fetch and execution streams are infeasible or undesirable.
We describe and evaluate a distributed Counting Dependence Predictor and protocol that achieves 92\% of the performance of perfect memory disambiguation. It outperforms a load-wait table, similar to the Alpha 21264, by 11\%. Idealized, centralized implementations of Store Sets and the Exclusive Collision Predictor, both of which would be difficult to implement in a distributed microarchitecture, achieve 97\% and 94\% of oracular performance, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dependence prediction; memory systems; multiprocessor and multicore architectures", } @Article{Jerger:2008:VCT, author = "Natalie Enright Jerger and Li-Shiuan Peh and Mikko Lipasti", title = "Virtual Circuit Tree Multicasting: a Case for On-Chip Hardware Multicast Support", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "229--240", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382141", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Current state-of-the-art on-chip networks provide efficiency, high throughput, and low latency for one-to-one (unicast) traffic. The presence of one-to-many (multicast) or one-to-all (broadcast) traffic can significantly degrade the performance of these designs, since they rely on multiple unicasts to provide one-to-many communication. This results in a burst of packets from a single source and is a very inefficient way of performing multicast and broadcast communication. This inefficiency is compounded by the proliferation of architectures and coherence protocols that require multicast and broadcast communication. In this paper, we characterize a wide array of on-chip communication scenarios that benefit from hardware multicast support. 
We propose Virtual Circuit Tree Multicasting (VCTM) and present a detailed multicast router design that improves network performance by up to 90\% while reducing network activity (hence power) by up to 53\%. Our VCTM router is flexible enough to improve interconnect performance for a broad spectrum of multicasting scenarios, and achieves these benefits with straightforward and inexpensive extensions to a state-of-the-art packet-switched router.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache coherence protocol; interconnection network; multiprocessor", } @Article{Kodi:2008:IIR, author = "Avinash Karanth Kodi and Ashwini Sarathy and Ahmed Louri", title = "{iDEAL}: Inter-router Dual-Function Energy and Area-Efficient Links for Network-on-Chip {(NoC)} Architectures", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "241--250", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382142", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Network-on-Chip (NoC) architectures have been adopted by a growing number of multi-core designs as a flexible and scalable solution to the increasing wire delay constraints in the deep sub-micron regime. However, the shrinking feature size limits the performance of NoCs due to power and area constraints. Research into the optimization of NoCs has shown that a reduction in the number of buffers in the NoC routers reduces the power and area overhead but degrades the network performance. In this paper, we propose iDEAL, a low-power area-efficient NoC architecture by reducing the number of buffers within the router.
To overcome the performance degradation caused by the reduced buffer size, we propose to use adaptive dual-function links capable of data transmission as well as data storage when required. Simulation results for the proposed architecture show that reducing the router buffer size in half and using the adaptive dual-function links achieves nearly 40\% savings in buffer power, 30\% savings in overall network power and about 41\% savings in the router area, with only a marginal 1-3\% drop in performance. Moreover, the performance in iDEAL can be further improved by aggressive and speculative flow control techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "interconnects; low-power architecture; network-on-chip", } @Article{Park:2008:MML, author = "Dongkook Park and Soumya Eachempati and Reetuparna Das and Asit K. Mishra and Yuan Xie and N. Vijaykrishnan and Chita R. Das", title = "{MIRA}: a Multi-layered On-Chip Interconnect Router Architecture", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "251--261", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.13", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recently, Network-on-Chip (NoC) architectures have gained popularity to address the interconnect delay problem for designing CMP / multi-core / SoC systems in deep sub-micron technology. However, almost all prior studies have focused on 2D NoC designs. Since three dimensional (3D) integration has emerged to mitigate the interconnect delay problem, exploring the NoC design space in 3D can provide ample opportunities to design high performance and energy-efficient NoC architectures. 
In this paper, we propose a 3D stacked NoC router architecture, called MIRA, which unlike the 3D routers in previous works, is stacked into multiple layers and optimized to reduce the overall area requirements and power consumption. We discuss the design details of a four-layer 3D NoC and its enhanced version with additional express channels, and compare them against a ($ 6 \times 6 $) 2D design and a baseline 3D design. All the designs are evaluated using a cycle-accurate 3D NoC simulator, and integrated with the Orion power model for performance and power analysis. The simulation results with synthetic and application traces demonstrate that the proposed multi-layered NoC routers can outperform the 2D and na{\"\i}ve 3D designs in terms of performance and power. It can achieve up to 42\% reduction in power consumption and up to 51\% improvement in average latency with synthetic workloads. With real workloads, these benefits are around 67\% and 38\%, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "3D; express channel; express path; Network-on-Chip; NoC; on-chip interconnect; router architecture", } @Article{Hower:2008:REE, author = "Derek R. Hower and Mark D. Hill", title = "{Rerun}: Exploiting Episodes for Lightweight Memory Race Recording", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "265--276", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382144", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Multiprocessor deterministic replay has many potential uses in the era of multicore computing, including enhanced debugging, fault tolerance, and intrusion detection.
While sources of nondeterminism in a uniprocessor can be recorded efficiently in software, it seems likely that hardware support will be needed in a multiprocessor environment where the outcome of memory races must also be recorded. We develop a memory race recording mechanism, called Rerun, that uses small hardware state ($ \approx 166 $ bytes/core), writes a small race log ($ \approx 4 $ bytes/kilo-instruction), and operates well as the number of cores per system scales (e.g., to 16 cores). Rerun exploits the dual of conventional wisdom in race recording: Rather than record information about individual memory accesses that conflict, we record how long a thread executes without conflicting with other threads. In particular, Rerun passively creates atomic episodes. Each episode is a dynamic instruction sequence that a thread happens to execute without interacting with other threads. Rerun uses Lamport Clocks to order episodes and enable replay of an equivalent execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "determinism; multicore; race recording", } @Article{Lucia:2008:AAD, author = "Brandon Lucia and Joseph Devietti and Karin Strauss and Luis Ceze", title = "{Atom-Aid}: Detecting and Surviving Atomicity Violations", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "277--288", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382145", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Writing shared-memory parallel programs is error-prone. Among the concurrency errors that programmers often face are atomicity violations, which are especially challenging.
They happen when programmers make incorrect assumptions about atomicity and fail to enclose memory accesses that should occur atomically inside the same critical section. If these accesses happen to be interleaved with conflicting accesses from different threads, the program might behave incorrectly. Recent architectural proposals arbitrarily group consecutive dynamic memory operations into atomic blocks to enforce memory ordering at a coarse grain. This provides what we call implicit atomicity, as the atomic blocks are not derived from explicit program annotations. In this paper, we make the fundamental observation that implicit atomicity probabilistically hides atomicity violations by reducing the number of interleaving opportunities between memory operations. We then propose Atom-Aid, which creates implicit atomic blocks intelligently instead of arbitrarily, dramatically reducing the probability that atomicity violations will manifest themselves. Atom-Aid is also able to report where atomicity violations might exist in the code, providing resilience and debuggability. 
We evaluate Atom-Aid using buggy code from applications including Apache, MySQL, and XMMS, showing that Atom-Aid virtually eliminates the manifestation of atomicity violations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bug; multiprocessors; parallel programming; software reliability", } @Article{Montesinos:2008:DRD, author = "Pablo Montesinos and Luis Ceze and Josep Torrellas", title = "{DeLorean}: Recording and Deterministically Replaying Shared-Memory Multiprocessor Execution Efficiently", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "289--300", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.36", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Support for deterministic replay of multithreaded execution can greatly help in finding concurrency bugs. For highest effectiveness, replay schemes should (i) record at production-run speed, (ii) keep their logging requirements minute, and (iii) replay at a speed similar to that of the initial execution. In this paper, we propose a new substrate for deterministic replay that provides substantial advances along these axes. In our proposal, processors execute blocks of instructions atomically, as in transactional memory or speculative multithreading, and the system only needs to record the commit order of these blocks. We call our scheme DeLorean. Our results show that DeLorean records execution at a speed similar to that of Release Consistency (RC) execution and replays at about 82\% of its speed. In contrast, most current schemes only record at the speed of Sequential Consistency (SC) execution. Moreover, DeLorean only needs 7.5\% of the log size needed by a state-of-the-art scheme. 
Finally, DeLorean can be configured to need only 0.6\% of the log size of the state-of-the-art scheme at the cost of recording at 86\% of RC's execution speed --- still faster than SC. In this configuration, the log of an 8-processor 5-GHz machine is estimated to be only about 20GB per day.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sankar:2008:IDP, author = "Sriram Sankar and Sudhanva Gurumurthi and Mircea R. Stan", title = "Intra-disk Parallelism: An Idea Whose Time Has Come", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "303--314", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382147", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Server storage systems use a large number of disks to achieve high performance, thereby consuming a significant amount of power. In this paper, we propose to significantly reduce the power consumed by such storage systems via intra-disk parallelism, wherein disk drives can exploit parallelism in the I/O request stream. Intra-disk parallelism can facilitate replacing a large disk array with a smaller one, using the minimum number of disk drives needed to satisfy the capacity requirements. We show that the design space of intra-disk parallelism is large and present a taxonomy to formulate specific implementations within this space. Using a set of commercial workloads, we perform a limit study to identify the key performance bottlenecks that arise when we replace a storage array that is tuned to provide high performance with a single high-capacity disk drive. 
We show that it is possible to match, and even surpass, the performance of a storage array for these workloads by using a single disk drive of sufficient capacity that exploits intra-disk parallelism, while significantly reducing the power consumed by the storage system. We evaluate the performance and power consumption of disk arrays composed of intra-disk parallel drives, and discuss engineering and cost issues related to the implementation and deployment of such disk drives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "disk; I/O; parallelism; power; storage", } @Article{Lim:2008:UDN, author = "Kevin Lim and Parthasarathy Ranganathan and Jichuan Chang and Chandrakant Patel and Trevor Mudge and Steven Reinhardt", title = "Understanding and Designing New Server Architectures for Emerging Warehouse-Computing Environments", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "315--326", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382148", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper seeks to understand and design next-generation servers for emerging 'warehouse-computing' environments. We make two key contributions. First, we put together a detailed evaluation infrastructure including a new benchmark suite for warehouse-computing workloads, and detailed performance, cost, and power models, to quantitatively characterize bottlenecks. Second, we study a new solution that incorporates volume non-server-class components in novel packaging solutions, with memory sharing and flash-based disk caching. 
Our results show that this approach has promise, with a 2X improvement on average in performance-per-dollar for our benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "evaluation; server architecture; warehouse-computing", } @Article{Kgil:2008:INF, author = "Taeho Kgil and David Roberts and Trevor Mudge", title = "Improving {NAND} Flash Based Disk Caches", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "327--338", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.32", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Flash is a widely used storage device that provides high density and low power, appealing properties for general purpose computing. Today, its usual application is in portable special purpose devices such as MP3 players. In this paper we examine its use in the server domain --- a more general purpose environment. Aggressive process scaling and the use of multi-level cells continues to improve density ahead of Moore's Law predictions, making Flash even more attractive as a general purpose memory solution. Unfortunately, reliability limits the use of Flash. To seriously consider Flash in the server domain, architectural support must exist to address this concern. This paper first shows how Flash can be used in today's server platforms as a disk cache. It then proposes two improvements. The first improves performance and reliability by splitting Flash based disk caches into separate read and write regions. The second improves reliability by employing a programmable Flash memory controller. 
It can change the error code strength (number of correctable bits) and the number of bits that a memory cell can store (cell density) according to the demands of the application. Our studies show that Flash reduces overall power consumed by the system memory and hard disk drive up to 3 times while maintaining performance. We also show that Flash lifetime can be improved by a factor of 20 when using a programmable Flash memory controller, if some performance degradation (below 5\%) is acceptable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "data center; disk cache; Flash; Flash memory controller; NAND Flash", } @Article{Li:2008:OEA, author = "Xiaodong Li and Sarita V. Adve and Pradip Bose and Jude A. Rivers", title = "Online Estimation of Architectural Vulnerability Factor for Soft Errors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "341--352", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382150", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As CMOS technology scales and more transistors are packed on to the same chip, soft error reliability has become an increasingly important design issue for processors. Prior research has shown that there is significant architecture-level masking, and many soft error solutions take advantage of this effect. Prior work has also shown that the degree of such masking can vary significantly across workloads and between individual workload phases, motivating dynamic adaptation of reliability solutions for optimal cost and benefit. For such adaptation, it is important to be able to accurately estimate the amount of masking or the architecture vulnerability factor (AVF) online, while the program is running. 
Unfortunately, existing solutions for estimating AVF are often based on offline simulators and hard to implement in real processors. This paper proposes a novel way of estimating AVF online, using simple modifications to the processor. The estimation method applies to both logic and storage structures on the processor. Compared to previous methods for estimating AVF, our method does not require any offline simulation or calibration for different workloads. We tested our method with a widely used simulator from industry, for four processor structures and for 100 to 200 intervals of each of eleven SPEC benchmarks. The results show that our method provides acceptably accurate AVF estimates at runtime. The absolute error rarely exceeds 0.08 across all application intervals for all structures, and the mean absolute error for a given application and structure combination is always within 0.05.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "AVF estimation; processor reliability; soft error", } @Article{Shin:2008:PWR, author = "Jeonghee Shin and Victor Zyuban and Pradip Bose and Timothy M. Pinkston", title = "A Proactive Wearout Recovery Approach for Exploiting Microarchitectural Redundancy to Extend Cache {SRAM} Lifetime", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "353--362", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382151", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Microarchitectural redundancy has been proposed as a means of improving chip lifetime reliability. 
It is typically used in a reactive way, allowing chips to maintain operability in the presence of failures by detecting and isolating, correcting, and/or replacing components on a first-come, first-served basis only after they become faulty. In this paper, we explore an alternative, more preferred method of exploiting microarchitectural redundancy to enhance chip lifetime reliability. In our proposed approach, redundancy is used proactively to allow non-faulty microarchitecture components to be temporarily deactivated, on a rotating basis, to suspend and/or recover from certain wearout effects. This approach improves chip lifetime reliability by warding off the onset of wearout failures as opposed to reacting to them posteriorly. Applied to on-chip cache SRAM for combating NBTI-induced wearout failure, our proactive wearout recovery approach increases lifetime reliability (measured in mean-time-to-failure) of the cache by about a factor of seven relative to no use of microarchitectural redundancy and a factor of five relative to conventional reactive use of redundancy having similar area overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "lifetime reliability; microarchitectural redundancy; proactive approach; wearout recovery", } @Article{Teodorescu:2008:VAA, author = "Radu Teodorescu and Josep Torrellas", title = "Variation-Aware Application Scheduling and Power Management for Chip Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "363--374", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.40", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Within-die process variation causes individual cores in a Chip Multiprocessor (CMP) to differ substantially in both
static power consumed and maximum frequency supported. In this environment, ignoring variation effects when scheduling applications or when managing power with Dynamic Voltage and Frequency Scaling (DVFS) is suboptimal. This paper proposes variation-aware algorithms for application scheduling and power management. One such power management algorithm, called {\em LinOpt}, uses linear programming to find the best voltage and frequency levels for each of the cores in the CMP --- maximizing throughput at a given power budget. In a 20-core CMP, the combination of variation-aware application scheduling and {\em LinOpt\/} increases the average throughput by 12--17\% and reduces the average $ E D^2 $ by 30--38\% --- all relative to using variation-aware scheduling together with a simple extension to Intel's Foxton power management algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "application scheduling; power management; process variation", } @Article{Chen:2008:FHA, author = "Shimin Chen and Michael Kozuch and Theodoros Strigkos and Babak Falsafi and Phillip B. Gibbons and Todd C. Mowry and Vijaya Ramachandran and Olatunji Ruwase and Michael Ryan and Evangelos Vlachos", title = "Flexible Hardware Acceleration for Instruction-Grain Program Monitoring", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "377--388", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382153", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Instruction-grain program monitoring tools, which check and analyze executing programs at the granularity of individual instructions, are invaluable for quickly detecting bugs and security attacks and then limiting their damage (via containment and/or recovery).
Unfortunately, their fine-grain nature implies very high monitoring overheads for software-only tools, which are typically based on dynamic binary instrumentation. Previous hardware proposals either focus on mechanisms that target specific bugs or address only the cost of binary instrumentation. In this paper, we propose a flexible hardware solution for accelerating a wide range of instruction-grain monitoring tools. By examining a number of diverse tools (for memory checking, security tracking, and data race detection), we identify three significant common sources of overheads and then propose three novel hardware techniques for addressing these overheads: Inheritance Tracking, Idempotent Filters, and Metadata-TLBs. Together, these constitute a general-purpose hardware acceleration framework. Experimental results show our framework reduces overheads by 2-3X over the previous state-of-the-art, while supporting the needed flexibility.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "hardware acceleration; idempotent filter; inheritance tracking; instruction-grain program monitoring; LBA; lifeguards; log-based architectures; metadata-TLB", } @Article{Clark:2008:VVE, author = "Nathan Clark and Amir Hormati and Scott Mahlke", title = "{VEAL}: Virtualized Execution Accelerator for Loops", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "389--400", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.33", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Performance improvement solely through transistor scaling is becoming more and more difficult, thus it is increasingly common to see domain specific accelerators used in conjunction with general purpose processors to achieve future performance 
goals. There is a serious drawback to accelerators, though: binary compatibility. An application compiled to utilize an accelerator cannot run on a processor without that accelerator, and applications that do not utilize an accelerator will never use it. To overcome this problem, we propose decoupling the instruction set architecture from the underlying accelerators. Computation to be accelerated is expressed using a processor's baseline instruction set, and light-weight dynamic translation maps the representation to whatever accelerators are available in the system. In this paper, we describe the changes to a compilation framework and processor system needed to support this abstraction for an important set of accelerator designs that support innermost loops. In this analysis, we investigate the dynamic overheads associated with abstraction as well as the static/dynamic tradeoffs to improve the dynamic mapping of loop-nests. As part of the exploration, we also provide a quantitative analysis of the hardware characteristics of effective loop accelerators. We conclude that using a hybrid static-dynamic compilation approach to map computation on to loop-level accelerators is a practical way to increase computation efficiency, without the overheads associated with instruction set modification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:2008:SSP, author = "Haibo Chen and Xi Wu and Liwei Yuan and Binyu Zang and Pen-chung Yew and Frederic T. 
Chong", title = "From Speculation to Security: Practical and Efficient Information Flow Tracking Using Speculative Hardware", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "401--412", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382156", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Dynamic information flow tracking (also known as taint tracking) is an appealing approach to combat various security attacks. However, the performance of applications can severely degrade without hardware support for tracking taints. This paper observes that information flow tracking can be efficiently emulated using deferred exception tracking in microprocessors supporting speculative execution. Based on this observation, we propose SHIFT, a low-overhead, software-based dynamic information flow tracking system to detect a wide range of attacks. The key idea is to treat tainted state (describing untrusted data) as speculative state (describing deferred exceptions). SHIFT leverages existing architectural support for speculative execution to track tainted state in registers and needs to instrument only load and store instructions to track tainted state in memory using a bitmap, which results in significant performance advantages. Moreover, by decoupling mechanisms for taint tracking from security policies, SHIFT can detect a wide range of exploits, including high-level semantic attacks. We have implemented SHIFT using the Itanium processor, which has support for deferred exceptions, and by modifying GCC to instrument loads and stores. A security assessment shows that SHIFT can detect both low-level memory corruption exploits as well as high-level semantic attacks with no false positives. Performance measurements show that SHIFT incurs about 1\% overhead for server applications. 
The performance slowdown for SPEC-INT2000 is 2.81X and 2.27X for tracking at byte-level and word-level respectively. Minor architectural improvements to the Itanium processor (adding three simple instructions) can reduce the performance slowdown down to 2.32X and 1.8X for byte-level and word-level tracking, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "deferred exception; dynamic information flow tracking; speculative execution; taint tracking", } @Article{Boneti:2008:SCP, author = "Carlos Boneti and Francisco J. Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu and Chen-Yong Cher and Mateo Valero", title = "Software-Controlled Priority Characterization of {POWER5} Processor", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "415--426", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.8", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Due to the limitations of instruction-level parallelism, thread-level parallelism has become a popular way to improve processor performance. One example is the IBM POWER5{\texttrademark} processor, a two-context simultaneous-multithreaded dual-core chip. In each SMT core, the IBM POWER5 features two levels of thread resource balancing and prioritization. The first level provides automatic in-hardware resource balancing, while the second level is a software-controlled priority mechanism that presents eight levels of thread priorities. Currently, software-controlled prioritization is only used in limited number of cases in the software platforms due to lack of performance characterization of the effects of this mechanism. In this work, we characterize the effects of the software-based prioritization on several different workloads.
We show that the impact of the prioritization significantly depends on the workloads coscheduled on a core. By prioritizing the right task, it is possible to obtain more than two times of throughput improvement for synthetic workloads compared to the baseline. We also present two application case studies targeting two different performance metrics: the first case study improves overall throughput by 23.7\% and the second case study reduces the total execution time by 9.3\%. In addition, we show the circumstances when a background thread can be run transparently without affecting the performance of the foreground thread.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "IBM POWER5; performance characterization; simultaneous multithreading; SMT; software-controlled prioritization", } @Article{Shye:2008:LLR, author = "Alex Shye and Berkin Ozisikyilmaz and Arindam Mallik and Gokhan Memik and Peter A. Dinda and Robert P. Dick and Alok N. Choudhary", title = "Learning and Leveraging the Relationship between Architecture-Level Measurements and Individual User Satisfaction", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "427--438", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382158", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The ultimate goal of computer design is to satisfy the end-user. In particular computing domains, such as interactive applications, there exists a variation in user expectations and user satisfaction relative to the performance of existing computer systems. In this work, we leverage this variation to develop more efficient architectures that are customized to end-users. 
We first investigate the relationship between microarchitectural parameters and user satisfaction. Specifically, we analyze the relationship between hardware performance counter (HPC) readings and individual satisfaction levels reported by users for representative applications. Our results show that the satisfaction of the user is strongly correlated to the performance of the underlying hardware. More importantly, the results show that user satisfaction is highly user-dependent. To take advantage of these observations, we develop a framework called Individualized Dynamic Voltage and Frequency Scaling (iDVFS). We study a group of users to characterize the relationship between the HPCs and individual user satisfaction levels. Based on this analysis, we use artificial neural networks to model the function from HPCs to user satisfaction for individual users. This model is then used online to predict user satisfaction and set the frequency level accordingly. A second set of user studies demonstrates that iDVFS reduces the CPU power consumption by over 25\% in representative applications as compared to the Windows XP DVFS algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dynamic power management; hardware performance counters; learning user satisfaction; user-aware architectures", } @Article{Kumar:2008:AVO, author = "Sanjeev Kumar and Daehyun Kim and Mikhail Smelyanskiy and Yen-Kuang Chen and Jatin Chhugani and Christopher J. Hughes and Changkyu Kim and Victor W. Lee and Anthony D. 
Nguyen", title = "Atomic Vector Operations on Chip Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "441--452", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382154", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The current trend is for processors to deliver dramatic improvements in parallel performance while only modestly improving serial performance. Parallel performance is harvested through vector/SIMD instructions as well as multithreading (through both multithreaded cores and chip multiprocessors). Vector parallelism can be more efficiently supported than multithreading, but is often harder for software to exploit. In particular, code with sparse data access patterns cannot easily utilize the vector/SIMD instructions of mainstream processors. Hardware to scatter and gather sparse data has previously been proposed to enable vector execution for these codes. However, on multithreaded architectures, a number of applications spend significant time on atomic operations (e.g., parallel reductions), which cannot be vectorized using previously proposed schemes. This paper proposes architectural support for atomic vector operations (referred to as GLSC) that addresses this limitation. GLSC extends scatter-gather hardware to support atomic memory operations. Our experiments show that the GLSC provides an average performance improvement on a set of important RMS kernels of 54\% for 4-wide SIMD.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "locks; multiprocessors; reductions; SIMD; vector", } @Article{Loh:2008:SMA, author = "Gabriel H. 
Loh", title = "{$3$D}-Stacked Memory Architectures for Multi-core Processors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "453--464", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382159", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Three-dimensional integration enables stacking memory directly on top of a microprocessor, thereby significantly reducing wire delay between the two. Previous studies have examined the performance benefits of such an approach, but all of these works only consider commodity 2D DRAM organizations. In this work, we explore more aggressive 3D DRAM organizations that make better use of the additional die-to-die bandwidth provided by 3D stacking, as well as the additional transistor count. Our simulation results show that with a few simple changes to the 3D-DRAM organization, we can achieve a 1.75x speedup over previously proposed 3D-DRAM approaches on our memory-intensive multi-programmed workloads on a quad-core processor. The significant increase in memory system performance makes the L2 miss handling architecture (MHA) a new bottleneck, which we address by combining a novel data structure called the Vector Bloom Filter with dynamic MSHR capacity tuning. 
Our scalable L2 MHA yields an additional 17.8\% performance improvement over our 3D-stacked memory architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "3D integration; memory; multi-core", } @Article{Anonymous:2008:AI, author = "Anonymous", title = "Author Index", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "465--466", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382160", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2008:PI, author = "Anonymous", title = "{Publisher}'s Information", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "468--468", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382161", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Anonymous:2008:CA, author = "Anonymous", title = "Cover Art", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "C1--C1", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382162", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Karne:2008:OSC, author = "Ramesh K. 
Karne and Alexander L. Wijesinha and George H. {Ford, Jr.}", title = "Opinion: stay on course with an evolution or choose a revolution in computing", journal = j-COMP-ARCH-NEWS, volume = "36", number = "4", pages = "1--6", month = sep, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1462609.1462611", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Dec 8 14:01:02 MST 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2008:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "36", number = "4", pages = "7--11", month = sep, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1462609.1462613", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Dec 8 14:01:02 MST 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bengtsson:2008:DSA, author = "Jerker Bengtsson and Bertil Svensson", title = "A domain-specific approach for software development on {Manycore} platforms", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "2--10", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556446", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The programming complexity of increasingly parallel processors calls for new tools that assist programmers in utilising the parallel hardware resources. In this paper we present a set of models that we have developed as part of a tool for mapping dataflow graphs onto manycores. 
One of the models captures the essentials of manycores identified as suitable for signal processing, and which we use as target for our algorithms. As an intermediate representation we introduce timed configuration graphs, which describe the mapping of a model of an application onto a machine model. Moreover, we show how a timed configuration graph by very simple means can be evaluated using an abstract interpretation to obtain performance feedback. This information can be used by our tool and by the programmer in order to discover improved mappings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cederman:2008:SLB, author = "Daniel Cederman and Philippas Tsigas", title = "On sorting and load balancing on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "11--18", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556447", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper we take a look at GPU-Quicksort, an efficient Quicksort algorithm suitable for the highly parallel multi-core graphics processors. Quicksort had previously been considered an inefficient sorting solution for graphics processors, but GPU-Quicksort often performs better than the fastest known sorting implementations for graphics processors, such as radix and bitonic sort. Quicksort can thus be seen as a viable alternative for sorting large quantities of data on graphics processors.\par We also take a look at a comparison of different load balancing schemes. To get maximum performance on the many-core graphics processors it is important to have an even balance of the workload so that all processing units contribute equally to the task at hand.
This can be hard to achieve when the cost of a task is not known beforehand and when new sub-tasks are created dynamically during execution. With the recent advent of scatter operations and atomic hardware primitives it is now possible to bring some of the more elaborate dynamic load balancing schemes from the conventional SMP systems domain to the graphics processor domain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ha:2008:NBP, author = "Phuong Hoai Ha and Philippas Tsigas and Otto J. Anshus", title = "Non-blocking programming on multi-core graphics processors: (extended abstract)", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "19--28", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556448", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper investigates the synchronization power of coalesced memory accesses, a family of memory access mechanisms introduced in recent large multicore architectures like the CUDA graphics processors. We first design three memory access models to capture the fundamental features of the new memory access mechanisms. Subsequently, we prove the exact synchronization power of these models in terms of their consensus numbers. 
These tight results show that the coalesced memory access mechanisms can facilitate strong synchronization between the threads of multicore processors, without the need of synchronization primitives other than reads and writes.\par Moreover, based on the intrinsic features of recent GPU architectures, we construct strong synchronization objects like wait-free and t-resilient read-modify-write objects for a general model of recent GPU architectures without strong hardware synchronization primitives like test-and-set and compare-and-swap. Accesses to the wait-free objects have time complexity $ O(N) $, where $N$ is the number of processes. Our result demonstrates that it is possible to construct wait-free synchronization mechanisms for GPUs without the need of strong synchronization primitives in hardware and that wait-free programming is possible for GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhattacharyya:2008:ODT, author = "Shuvra S. Bhattacharyya and Gordon Brebner and J{\"o}rn W. Janneck and Johan Eker and Carl von Platen and Marco Mattavelli and Micka{\"e}l Raulet", title = "{OpenDF}: a dataflow toolset for reconfigurable hardware and multicore systems", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "29--35", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556449", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents the OpenDF framework and recalls that dataflow programming was once invented to address the problem of parallel computing. We discuss the problems with an imperative style, von Neumann programs, and present what we believe are the advantages of using a dataflow programming model.
The CAL actor language is briefly presented and its role in the ISO/MPEG standard is discussed. The Dataflow Interchange Format (DIF) and related tools can be used for analysis of actors and networks, demonstrating the advantages of a dataflow approach. Finally, an overview of a case study implementing an MPEG-4 decoder is given.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kessler:2008:OCP, author = "Christoph W. Kessler and J{\"o}rg Keller", title = "Optimized on-chip pipelining of memory-intensive computations on the cell {BE}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "36--45", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556450", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Multiprocessors-on-chip, such as the Cell BE processor, regularly suffer from restricted bandwidth to off-chip main memory. We propose to reduce memory bandwidth requirements, and thus increase performance, by expressing our application as a task graph, by running dependent tasks concurrently and by pipelining results directly from task to task where possible, instead of buffering in off-chip memory. To maximize bandwidth savings and balance load simultaneously, we solve a mapping problem of tasks to SPEs on the Cell BE. We present three approaches: an integer linear programming formulation that allows to compute Pareto-optimal mappings for smaller task graphs, general heuristics, and a problem specific approximation algorithm.
We validate the mappings for data-parallel computations and sorting.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lundvall:2008:APS, author = "H{\aa}kan Lundvall and Kristian Stav{\aa}ker and Peter Fritzson and Christoph Kessler", title = "Automatic parallelization of simulation code for equation-based models with software pipelining and measurements on three platforms", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "46--55", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556451", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this work we report results from a new integrated method of automatically generating parallel code from Modelica models by combining parallelization at two levels of abstraction. Performing inline expansion of a Runge--Kutta solver combined with fine-grained automatic parallelization of the right-hand side of the resulting equation system opens up new possibilities for generating high performance code, which is becoming increasingly relevant when multi-core computers are becoming commonplace. An implementation, in the form of a backend module for the OpenModelica compiler, has been developed and used for measurements on two architectures: Intel Xeon and SGI Altix 3700 Bx2.
This paper also contains some very recent results of a prototype implementation of this parallelization approach on the Cell BE processor architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fang:2008:SDA, author = "Huan Fang and Mats Brorsson", title = "Scalable directory architecture for distributed shared memory chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "56--64", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556452", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Traditional Directory-based cache coherence protocol is far from optimal for large-scale cache coherent shared memory multiprocessors due to the increasing latency to access directories stored in DRAM memory. Instead of keeping directories in main memory, we consider distributing the directory together with L2 cache across all nodes on a Chip Multiprocessor. Each node contains a processing unit, a private L1 cache, a slice of L2 cache, memory controller and a router. Both L2 cache and memories are distributed shared and interleaved by a subset of memory address bits. All nodes are interconnected through a low latency two dimensional Mesh network. Directory, being a split component to L2 cache, only stores sharing information for blocks while L2 cache stores only data blocks exclusive with L1 cache. Shared L2 cache can increase total effective cache capacity on chip, but also increase the miss latency when data is on a remote node. Being different from Directory Cache structure, our proposal totally removes the directory from memory, which saves memory space and reduces access latency. 
Compared to L2 cache that combines directory information internally, our L2 cache structure saves up to 88\% cache space and achieves similar performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jonsson:2008:SSE, author = "Bengt Jonsson", title = "State-space exploration for concurrent algorithms under weak memory orderings: (preliminary version)", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "65--71", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556453", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Several concurrent implementations of familiar data abstractions such as queues, sets, or maps typically do not follow locking disciplines, and often use lock-free synchronization to gain performance. Since such algorithms are exposed to a weak memory model, they are notoriously hard to get correct, as witnessed by many bugs found in published algorithms. We outline a technique for analyzing correctness of concurrent algorithms under weak memory models, in which a model checker is used to search for correctness violations. The algorithm to be analyzed is transformed into a form where statements may be reordered according to a particular weak memory ordering. The transformed algorithm can then be analyzed by a model-checking tool, e.g., by enumerative state exploration. 
We illustrate the approach on a small example of a queue, which allows an enqueue operation to be concurrent with a dequeue operation, which we analyze with respect to the RMO memory model defined in SPARC v9.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Abdulla:2008:MCR, author = "Parosh Aziz Abdulla and Fr{\'e}d{\'e}ric Haziza and Mats Kindahl", title = "Model checking race-freeness", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "72--79", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556454", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the introduction of highly concurrent systems in standard desktop computers, ensuring correctness of industrial-size concurrent programs is becoming increasingly important. One of the most important standards in use for developing multi-threaded programs is the POSIX Threads standard, commonly known as PThreads. Of particular importance, the analysis of industrial code should, as far as possible, be automatic and not require annotations or other forms of specifications of the code.\par Model checking has been one of the most successful approaches to program verification during the last two decades. The size and complexity of applications which can be handled have increased rapidly through integration with symbolic techniques. These methods are designed to work on finite (but large) state spaces. 
This framework fails to deal with several essential aspects of behaviours for multithreaded programs: there is no bound a priori on the number of threads which may arise in a given run of the system; each thread manipulates local variables which often range over unbounded domains; and the system has a dynamic structure in the sense that threads can be created and killed throughout execution of the system. In this paper we concentrate on checking a particular class of properties for concurrent programs, namely safety properties. In particular, we focus on race-freeness, that is, the absence of race conditions (also known as data races) in shared-variable pthreaded programs.\par We will follow a particular methodology which we have earlier developed for model checking general classes of infinite-state systems [1, 3, 6, 8, 9] and apply a symbolic backward reachability analysis to verify the safety property. Since we construct a model as an over-approximation of the original program, proving the safety property in the model implies that the property also holds in the original system. Surprisingly, it leads to a quite efficient analysis which can be carried out fully automatically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sundell:2008:NNB, author = "H{\aa}kan Sundell and Philippas Tsigas", title = "{NOBLE}: non-blocking programming support via lock-free shared abstract data types", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "80--87", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556455", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "An essential part of programming for multi-core and multi-processor includes efficient and reliable means for sharing data.
Lock-free data structures are known as very suitable for this purpose, although experienced to be very complex to design. In this paper, we present a software library of non-blocking abstract data types that have been designed to facilitate lock-free programming for non-experts. The system provides: (i) efficient implementations of the most commonly used data types in concurrent and sequential software design, (ii) a lock-free memory management system, and (iii) a run-time system. The library provides clear semantics that are at least as strong as those of corresponding lock-based implementations of the respective data types. Our software library can be used for facilitating lock-free programming; its design enables the programmer to: (i) replace lock-based components of sequential or parallel code easily and efficiently, (ii) use well-tuned concurrent algorithms inside a software or hardware transactional system. In the paper we describe the design and functionality of the system. We also provide experimental results that show that the library can considerably improve the performance of software systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gidenstam:2008:LLF, author = "Anders Gidenstam and Marina Papatriantafilou", title = "{LFTHREADS}: a lock-free thread library", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "88--92", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556456", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This extended abstract presents LFTHREADS, a thread library entirely based on lock-free methods, i.e. no spinlocks or similar synchronization mechanisms are employed in the implementation of the multithreading.
Since lock-freedom is highly desirable in multiprocessors/multicores due to its advantages in parallelism, fault-tolerance, convoy-avoidance and more, there is an increased demand in lock-free methods in parallel applications, hence also in multiprocessor/multicore system services. LFTHREADS is the first thread library that provides a lock-free implementation of blocking synchronization primitives for application threads; although the latter may sound like a contradicting goal, such objects have several benefits: e.g. library operations that block and unblock threads on the same synchronization object can make progress in parallel while maintaining the desired thread-level semantics and without having to wait for any ``slow'' operations among them. Besides, as no spin-locks or similar synchronization mechanisms are employed, memory contention can be reduced and processors/cores are able to do useful work. As a consequence, applications, too, can enjoy enhanced parallelism and fault-tolerance. For the synchronization in LFTHREADS we have introduced a new method, which we call responsibility hand-off (RHO), that does not need any special kernel support. The RHO method is also of independent interest, as it can also serve as a tool for lock-free token passing, management of contention and interaction between scheduling and synchronization. This paper gives an outline and the context of LFTHREADS.
For more details the reader is referred to [7] and [8].", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Faxen:2008:WWS, author = "Karl-Filip Fax{\'e}n", title = "{Wool} --- a work stealing library", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "93--100", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556457", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents some preliminary results on a small light weight user level task management library called Wool. The Wool task scheduler is based on work stealing. The objective of the library is to provide a reasonably convenient programming interface (in particular by not forcing the programmer to write in continuation passing style) in ordinary C while still having a very low task creation overhead. Several task scheduling systems based on work stealing exist, but they are typically either programming languages like Cilk-5 or based on C++ like the Intel TBB or C\# as in the Microsoft TPL.
Our main conclusions are that such a direct style interface is indeed possible and yields performance that is comparable to that of the Intel TBB.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2008:INb, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "101--111", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556459", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } %%% TO DO: [26-Jun-2009] Volume 36 number 6: no data yet in ACM Portal database @Article{Gebhart:2009:ETC, author = "Mark Gebhart and Bertrand A. Maher and Katherine E. Coons and Jeff Diamond and Paul Gratz and Mario Marino and Nitya Ranganathan and Behnam Robatmili and Aaron Smith and James Burrill and Stephen W. Keckler and Doug Burger and Kathryn S. McKinley", title = "An evaluation of the {TRIPS} computer system", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "1--12", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508246", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The TRIPS system employs a new instruction set architecture (ISA) called Explicit Data Graph Execution (EDGE) that renegotiates the boundary between hardware and software to expose and exploit concurrency. EDGE ISAs use a block-atomic execution model in which blocks are composed of dataflow instructions. 
The goal of the TRIPS design is to mine concurrency for high performance while tolerating emerging technology scaling challenges, such as increasing wire delays and power consumption. This paper evaluates how well TRIPS meets this goal through a detailed ISA and performance analysis. We compare performance, using cycle counts, to commercial processors. On SPEC CPU2000, the Intel Core 2 outperforms compiled TRIPS code in most cases, although TRIPS matches a Pentium 4. On simple benchmarks, compiled TRIPS code outperforms the Core 2 by 10\% and hand-optimized TRIPS code outperforms it by a factor of 3. Compared to conventional ISAs, the block-atomic model provides a larger instruction window, increases concurrency at a cost of more instructions executed, and replaces register and memory accesses with more efficient direct instruction-to-instruction communication. Our analysis suggests ISA, microarchitecture, and compiler enhancements for addressing weaknesses in TRIPS and indicates that EDGE architectures have the potential to exploit greater concurrency in future technologies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Pistol:2009:AIN, author = "Constantin Pistol and Wutichai Chongchitmate and Christopher Dwyer and Alvin R. Lebeck", title = "Architectural implications of nanoscale integrated sensing and computing", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "13--24", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508247", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper explores the architectural implications of integrating computation and molecular probes to form nanoscale sensor processors (nSP).
We show how nSPs may enable new computing domains and automate tasks that currently require expert scientific training and costly equipment. This new application domain severely constrains nSP size, which significantly impacts the architectural design space. In this context, we explore nSP architectures and present an nSP design that includes a simple accumulator-based ISA, sensors, limited memory and communication transceivers. To reduce the application memory footprint, we introduce the concept of instruction-fused sensing. We use simulation and analytical models to evaluate nSP designs executing a representative set of target applications. Furthermore, we propose a candidate nSP technology based on optical Resonance Energy Transfer (RET) logic that enables the small size required by the application domain; our smallest design is about the size of the largest known virus. We also show laboratory results that demonstrate initial steps towards a prototype.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Park:2009:CEA, author = "Soyeon Park and Shan Lu and Yuanyuan Zhou", title = "{CTrigger}: exposing atomicity violation bugs from their hiding places", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "25--36", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508249", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Multicore hardware is making concurrent programs pervasive. Unfortunately, concurrent programs are prone to bugs. Among different types of concurrency bugs, atomicity violation bugs are common and important. 
Existing techniques to detect atomicity violation bugs suffer from one limitation: requiring bugs to manifest during monitored runs, which is an open problem in concurrent program testing. This paper makes two contributions. First, it studies the interleaving characteristics of the common practice in concurrent program testing (i.e., running a program over and over) to understand why atomicity violation bugs are hard to expose. Second, it proposes CTrigger to effectively and efficiently expose atomicity violation bugs in large programs. CTrigger focuses on a special type of interleavings (i.e., unserializable interleavings) that are inherently correlated to atomicity violation bugs, and uses trace analysis to systematically identify (likely) feasible unserializable interleavings with low occurrence-probability. CTrigger then uses minimum execution perturbation to exercise low-probability interleavings and expose difficult-to-catch atomicity violation. We evaluate CTrigger with real-world atomicity violation bugs from four server/desktop applications (Apache, MySQL, Mozilla, and PBZIP2) and three SPLASH2 applications on 8-core machines. CTrigger efficiently exposes the tested bugs within 1--235 seconds, two to four orders of magnitude faster than stress testing. Without CTrigger, some of these bugs do not manifest even after 7 full days of stress testing. In addition, without deterministic replay support, once a bug is exposed, CTrigger can help programmers reliably reproduce it for diagnosis. Our tested bugs are reproduced by CTrigger mostly within 5 seconds, 300 to over 60000 times faster than stress testing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Sidiroglou:2009:AAS, author = "Stelios Sidiroglou and Oren Laadan and Carlos Perez and Nicolas Viennot and Jason Nieh and Angelos D.
Keromytis", title = "{ASSURE}: automatic software self-healing using rescue points", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "37--48", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508250", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software failures in server applications are a significant problem for preserving system availability. We present ASSURE, a system that introduces rescue points that recover software from unknown faults while maintaining both system integrity and availability, by mimicking system behavior under known error conditions. Rescue points are locations in existing application code for handling a given set of programmer-anticipated failures, which are automatically repurposed and tested for safely enabling fault recovery from a larger class of (unanticipated) faults. When a fault occurs at an arbitrary location in the program, ASSURE restores execution to an appropriate rescue point and induces the program to recover execution by virtualizing the program's existing error-handling facilities. Rescue points are identified using fuzzing, implemented using a fast coordinated checkpoint-restart mechanism that handles multi-process and multi-threaded applications, and, after testing, are injected into production code using binary patching. We have implemented an ASSURE Linux prototype that operates without application source code and without base operating system kernel changes. 
Our experimental results on a set of real-world server applications and bugs show that ASSURE enabled recovery for all of the bugs tested with fast recovery times, has modest performance overhead, and provides automatic self-healing orders of magnitude faster than current human-driven patch deployment methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Lenharth:2009:RDO, author = "Andrew Lenharth and Vikram S. Adve and Samuel T. King", title = "Recovery domains: an organizing principle for recoverable operating systems", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "49--60", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508251", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We describe a strategy for enabling existing commodity operating systems to recover from unexpected run-time errors in nearly any part of the kernel, including core kernel components. Our approach is dynamic and request-oriented; it isolates the effects of a fault to the requests that caused the fault rather than to static kernel components. This approach is based on a notion of ``recovery domains,'' an organizing principle to enable rollback of state affected by a request in a multithreaded system with minimal impact on other requests or threads. We have applied this approach on v2.4.22 and v2.6.27 of the Linux kernel and it required 132 lines of changed or new code: the other changes are all performed by a simple instrumentation pass of a compiler. 
Our experiments show that the approach is able to recover from otherwise fatal faults with minimal collateral impact during a recovery event.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Dimitrov:2009:ABB, author = "Martin Dimitrov and Huiyang Zhou", title = "Anomaly-based bug prediction, isolation, and validation: an automated approach for software debugging", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "61--72", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508252", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software defects, commonly known as bugs, present a serious challenge for system reliability and dependability. Once a program failure is observed, the debugging activities to locate the defects are typically nontrivial and time consuming. In this paper, we propose a novel automated approach to pin-point the root-causes of software failures. Our proposed approach consists of three steps. The first step is bug prediction, which leverages the existing work on anomaly-based bug detection as exceptional behavior during program execution has been shown to frequently point to the root cause of a software failure. The second step is bug isolation, which eliminates false-positive bug predictions by checking whether the dynamic forward slices of bug predictions lead to the observed program failure. The last step is bug validation, in which the isolated anomalies are validated by dynamically nullifying their effects and observing if the program still fails. The whole bug prediction, isolation and validation process is fully automated and can be implemented with efficient architectural support. 
Our experiments with 6 programs and 7 bugs, including a real bug in the gcc 2.95.2 compiler, show that our approach is highly effective at isolating only the relevant anomalies. Compared to state-of-art debugging techniques, our proposed approach pinpoints the defect locations more accurately and presents the user with a much smaller code set to analyze.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Montesinos:2009:CSH, author = "Pablo Montesinos and Matthew Hicks and Samuel T. King and Josep Torrellas", title = "{Capo}: a software-hardware interface for practical deterministic multiprocessor replay", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "73--84", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508254", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "While deterministic replay of parallel programs is a powerful technique, current proposals have shortcomings. Specifically, software-based replay systems have high overheads on multiprocessors, while hardware-based proposals focus only on basic hardware-level mechanisms, ignoring the overall replay system. To be practical, hardware-based replay systems need to support an environment with multiple parallel jobs running concurrently --- some being recorded, others being replayed and even others running without recording or replay. Moreover, they need to manage limited-size log buffers. This paper addresses these shortcomings by introducing, for the first time, a set of abstractions and a software-hardware interface for practical hardware-assisted replay of multiprocessor systems. 
The approach, called Capo, introduces the novel abstraction of the Replay Sphere to separate the responsibilities of the hardware and software components of the replay system. In this paper, we also design and build CapoOne, a prototype of a deterministic multiprocessor replay system that implements Capo using Linux and simulated DeLorean hardware. Our evaluation of 4-processor executions shows that CapoOne largely records with the efficiency of hardware-based schemes and the flexibility of software-based schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Devietti:2009:DDS, author = "Joseph Devietti and Brandon Lucia and Luis Ceze and Mark Oskin", title = "{DMP}: deterministic shared memory multiprocessing", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "85--96", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508255", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Current shared memory multicore and multiprocessor systems are nondeterministic. Each time these systems execute a multithreaded application, even if supplied with the same input, they can produce a different output. This frustrates debugging and limits the ability to properly test multithreaded code, becoming a major stumbling block to the much-needed widespread adoption of parallel programming. In this paper we make the case for fully deterministic shared memory multiprocessing (DMP). The behavior of an arbitrary multithreaded program on a DMP system is only a function of its inputs. The core idea is to make inter-thread communication fully deterministic. 
Previous approaches to coping with nondeterminism in multithreaded programs have focused on replay, a technique useful only for debugging. In contrast, while DMP systems are directly useful for debugging by offering repeatability by default, we argue that parallel programs should execute deterministically in the field as well. This has the potential to make testing more assuring and increase the reliability of deployed multithreaded software. We propose a range of approaches to enforcing determinism and discuss their implementation trade-offs. We show that determinism can be provided with little performance cost using our architecture proposals on future hardware, and that software-only approaches can be utilized on existing systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Olszewski:2009:KED, author = "Marek Olszewski and Jason Ansel and Saman Amarasinghe", title = "{Kendo}: efficient deterministic multithreading in software", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "97--108", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508256", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Although chip-multiprocessors have become the industry standard, developing parallel applications that target them remains a daunting task. Non-determinism, inherent in threaded applications, causes significant challenges for parallel programmers by hindering their ability to create parallel applications with repeatable results. As a consequence, parallel applications are significantly harder to debug, test, and maintain than sequential programs. 
This paper introduces Kendo: a new software-only system that provides deterministic multithreading of parallel applications. Kendo enforces a deterministic interleaving of lock acquisitions and specially declared non-protected reads through a novel dynamically load-balanced deterministic scheduling algorithm. The algorithm tracks the progress of each thread using performance counters to construct a deterministic logical time that is used to compute an interleaving of shared data accesses that is both deterministic and provides good load balancing. Kendo can run on today's commodity hardware while incurring only a modest performance cost. Experimental results on the SPLASH-2 applications yield a geometric mean overhead of only 16\% when running on 4 processors. This low overhead makes it possible to benefit from Kendo even after an application is deployed. Programmers can start using Kendo today to program parallel applications that are easier to develop, debug, and test.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Tiwari:2009:CIF, author = "Mohit Tiwari and Hassan M. G. Wassel and Bita Mazloom and Shashidhar Mysore and Frederic T. Chong and Timothy Sherwood", title = "Complete information flow tracking from the gates up", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "109--120", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508258", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "For many mission-critical tasks, tight guarantees on the flow of information are desirable, for example, when handling important cryptographic keys or sensitive financial data. 
We present a novel architecture capable of tracking all information flow within the machine, including all explicit data transfers and all implicit flows (those subtly devious flows caused by not performing conditional operations). While the problem is impossible to solve in the general case, we have created a machine that avoids the general-purpose programmability that leads to this impossibility result, yet is still programmable enough to handle a variety of critical operations such as public-key encryption and authentication. Through the application of our novel gate-level information flow tracking method, we show how all flows of information can be precisely tracked. From this foundation, we then describe how a class of architectures can be constructed, from the gates up, to completely capture all information flows and we measure the impact of doing so on the hardware implementation, the ISA, and the programmer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Tam:2009:RAL, author = "David K. Tam and Reza Azimi and Livio B. Soares and Michael Stumm", title = "{RapidMRC}: approximating {L2} miss rate curves on commodity systems for online optimizations", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "121--132", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508259", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Miss rate curves (MRCs) are useful in a number of contexts. In our research, online L2 cache MRCs enable us to dynamically identify optimal cache sizes when cache-partitioning a shared-cache multicore processor. 
Obtaining L2 MRCs has generally been assumed to be expensive when done in software and consequently, their usage for online optimizations has been limited. To address these problems and opportunities, we have developed a low-overhead software technique to obtain L2 MRCs online on current processors, exploiting features available in their performance monitoring units so that no changes to the application source code or binaries are required. Our technique, called RapidMRC, requires a single probing period of roughly 221 million processor cycles (147 ms), and subsequently 124 million cycles (83 ms) to process the data. We demonstrate its accuracy by comparing the obtained MRCs to the actual L2 MRCs of 30 applications taken from SPECcpu2006, SPECcpu2000, and SPECjbb2000. We show that RapidMRC can be applied to sizing cache partitions, helping to achieve performance improvements of up to 27\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Eyerman:2009:PTC, author = "Stijn Eyerman and Lieven Eeckhout", title = "Per-thread cycle accounting in {SMT} processors", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "133--144", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508260", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes a cycle accounting architecture for Simultaneous Multithreading (SMT) processors that estimates the execution times for each of the threads had they been executed alone, while they are running simultaneously on the SMT processor. This is done by accounting each cycle to either a base, miss event or waiting cycle component during multi-threaded execution. 
Single-threaded alone execution time is then estimated as the sum of the base and miss event components; the waiting cycle component represents the lost cycle count due to SMT execution. The cycle accounting architecture incurs reasonable hardware cost (around 1KB of storage) and estimates single-threaded performance with average prediction errors around 7.2\% for two-program workloads and 11.7\% for four-program workloads. The cycle accounting architecture has several important applications to system software and its interaction with SMT hardware. For one, the estimated single-thread alone execution time provides an accurate picture to system software of the actually consumed processor cycles per thread. The alone execution time instead of the total execution time (timeslice) may make system software scheduling policies more effective. Second, a new class of thread-progress aware SMT fetch policies based on per-thread progress indicators enable system software level priorities to be enforced at the hardware level.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Hofmann:2009:MBM, author = "Owen S. Hofmann and Christopher J. Rossbach and Emmett Witchel", title = "Maximum benefit from a minimal {HTM}", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "145--156", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508262", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A minimal, bounded hardware transactional memory implementation significantly improves synchronization performance when used in an operating system kernel. We add HTM to Linux 2.4, a kernel with a simple, coarse-grained synchronization structure. 
The transactional Linux 2.4 kernel can improve performance of user programs by as much as 40\% over the non-transactional 2.4 kernel. It closes 68\% of the performance gap with the Linux 2.6 kernel, which has had significant engineering effort applied to improve scalability. We then extend our minimal HTM to a fast, unbounded transactional memory with a novel technique for coordinating hardware transactions and software synchronization. Overflowed transactions run in software, with only a minimal coupling between hardware and software systems. There is no performance penalty for overflow rates of less than 1\%. In one instance, at 16 processors and an overflow rate of 4\%, performance degrades from an ideal 4.3x to 3.6x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Dice:2009:EEC, author = "Dave Dice and Yossi Lev and Mark Moir and Daniel Nussbaum", title = "Early experience with a commercial hardware transactional memory implementation", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "157--168", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508263", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We report on our experience with the hardware transactional memory (HTM) feature of two pre-production revisions of a new commercial multicore processor. Our experience includes a number of promising results using HTM to improve performance in a variety of contexts, and also identifies some ways in which the feature could be improved to make it even better. 
We give detailed accounts of our experiences, sharing techniques we used to achieve the results we have, as well as describing challenges we faced in doing so.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Wells:2009:MMM, author = "Philip M. Wells and Koushik Chakraborty and Gurindar S. Sohi", title = "Mixed-mode multicore reliability", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "169--180", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508265", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Future processors are expected to observe increasing rates of hardware faults. Using Dual-Modular Redundancy (DMR), two cores of a multicore can be loosely coupled to redundantly execute a single software thread, providing very high coverage from many different sources of faults. This reliability, however, comes at a high price in terms of per-thread IPC and overall system throughput. We make the observation that a user may want to run both applications requiring high reliability, such as financial software, and more fault tolerant applications requiring high performance, such as media or web software, on the same machine at the same time. Yet a traditional DMR system must fully operate in redundant mode whenever any application requires high reliability. This paper proposes a Mixed-Mode Multicore (MMM), which enables most applications, including the system software, to run with high reliability in DMR mode, while applications that need high performance can avoid the penalty of DMR. 
Though conceptually simple, two key challenges arise: (1) care must be taken to protect reliable applications from any faults occurring to applications running in high performance mode, and (2) the desire to execute additional independent software threads for a performance application complicates the scheduling of computation to cores. After solving these issues, an MMM is shown to improve overall system performance, compared to a traditional DMR system, by approximately 2X when one reliable and one performance application are concurrently executing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Rajamani:2009:IDE, author = "Sriram Rajamani and G. Ramalingam and Venkatesh Prasad Ranganath and Kapil Vaswani", title = "{ISOLATOR}: dynamically ensuring isolation in concurrent programs", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "181--192", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508266", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we focus on concurrent programs that use locks to achieve isolation of data accessed by critical sections of code. We present ISOLATOR, an algorithm that guarantees isolation for well-behaved threads of a program that obey a locking discipline even in the presence of ill-behaved threads that disobey the locking discipline. ISOLATOR uses code instrumentation, data replication, and virtual memory protection to detect isolation violations and delays ill-behaved threads to ensure isolation. Our instrumentation scheme requires access only to the code of well-behaved threads. 
We have evaluated ISOLATOR on several benchmark programs and found that ISOLATOR can ensure isolation with reasonable runtime overheads. In addition, we present three general desiderata --- safety, isolation, and permissiveness --- for any scheme that attempts to ensure isolation, and formally prove that ISOLATOR satisfies all of these desiderata.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Tucek:2009:EOV, author = "Joseph Tucek and Weiwei Xiong and Yuanyuan Zhou", title = "Efficient online validation with delta execution", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "193--204", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508267", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software systems are constantly changing. Patches to fix bugs and patches to add features are all too common. Every change risks breaking a previously working system. Hence administrators loathe change, and are willing to delay even critical security patches until after fully validating their correctness. Compared to off-line validation, on-line validation has clear advantages since it tests against real life workloads. Yet unfortunately it imposes restrictive overheads as it requires running the old and new versions side-by-side. Moreover, due to spurious differences (e.g. event timing, random number generation, and thread interleavings), it is difficult to compare the two for validation. To allow more effective on-line patch validation, we propose a new mechanism, called delta execution, that is based on the observation that most patches are small. 
Delta execution merges the two side-by-side executions for most of the time and splits only when necessary, such as when they access different data or execute different code. This allows us to perform on-line validation not only with lower overhead but also with greatly reduced spurious differences, allowing us to effectively validate changes. We first validate the feasibility of our idea by studying the characteristics of 240 patches from 4 server programs; our examination shows that 77\% of the changes should not be expected to cause large changes and are thereby feasible for Delta execution. We then implemented Delta execution using dynamic instrumentation. Using real world patches from 7 server applications and 3 other programs, we compared our implementation of Delta execution against a traditional side-by-side on-line validation. Delta execution outperformed traditional validation by up to 128\%; further, for 3 of the changes, spurious differences caused the traditional validation to fail completely while Delta execution succeeded. This demonstrates that Delta execution can allow administrators to use on-line validation to confidently ensure the correctness of the changes they apply.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Meisner:2009:PES, author = "David Meisner and Brian T. Gold and Thomas F. Wenisch", title = "{PowerNap}: eliminating server idle power", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "205--216", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508269", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Data center power consumption is growing to unprecedented levels: the EPA estimates U.S. 
data centers will consume 100 billion kilowatt hours annually by 2011. Much of this energy is wasted in idle systems: in typical deployments, server utilization is below 30\%, but idle servers still consume 60\% of their peak power draw. Typical idle periods --- though frequent --- last seconds or less, confounding simple energy-conservation approaches. In this paper, we propose PowerNap, an energy-conservation approach where the entire system transitions rapidly between a high-performance active state and a near-zero-power idle state in response to instantaneous load. Rather than requiring fine-grained power-performance states and complex load-proportional operation from each system component, PowerNap instead calls for minimizing idle power and transition time, which are simpler optimization goals. Based on the PowerNap concept, we develop requirements and outline mechanisms to eliminate idle power waste in enterprise blade servers. Because PowerNap operates in low-efficiency regions of current blade center power supplies, we introduce the Redundant Array for Inexpensive Load Sharing (RAILS), a power provisioning approach that provides high conversion efficiency across the entire range of PowerNap's power demands. Using utilization traces collected from enterprise-scale commercial deployments, we demonstrate that, together, PowerNap and RAILS reduce average server power consumption by 74\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Caulfield:2009:GUF, author = "Adrian M. Caulfield and Laura M. 
Grupp and Steven Swanson", title = "{Gordon}: using flash memory to build fast, power-efficient clusters for data-intensive applications", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "217--228", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508270", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As our society becomes more information-driven, we have begun to amass data at an astounding and accelerating rate. At the same time, power concerns have made it difficult to bring the necessary processing power to bear on querying, processing, and understanding this data. We describe Gordon, a system architecture for data-centric applications that combines low-power processors, flash memory, and data-centric programming systems to improve performance for data-centric applications while reducing power consumption. The paper presents an exhaustive analysis of the design space of Gordon systems, focusing on the trade-offs between power, energy, and performance that Gordon must make. It analyzes the impact of flash-storage and the Gordon architecture on the performance and power efficiency of data-centric applications. It also describes a novel flash translation layer tailored to data intensive workloads and large flash storage arrays. 
Our data show that, using technologies available in the near future, Gordon systems can out-perform disk-based clusters by 1.5$ \times $ and deliver up to 2.5$ \times $ more performance per Watt.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Gupta:2009:DFT, author = "Aayush Gupta and Youngjae Kim and Bhuvan Urgaonkar", title = "{DFTL}: a flash translation layer employing demand-based selective caching of page-level address mappings", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "229--240", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508271", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent technological advances in the development of flash-memory based devices have consolidated their leadership position as the preferred storage media in the embedded systems market and opened new vistas for deployment in enterprise-scale storage systems. Unlike hard disks, flash devices are free from any mechanical moving parts, have no seek or rotational delays and consume lower power. However, the internal idiosyncrasies of flash technology make its performance highly dependent on workload characteristics. The poor performance of random writes has been a cause of major concern, which needs to be addressed to better utilize the potential of flash in enterprise-scale environments. We examine one of the important causes of this poor performance: the design of the Flash Translation Layer (FTL), which performs the virtual-to-physical address translations and hides the erase-before-write characteristics of flash. 
We propose a complete paradigm shift in the design of the core FTL engine from the existing techniques with our Demand-based Flash Translation Layer (DFTL), which selectively caches page-level address mappings. We develop a flash simulation framework called FlashSim. Our experimental evaluation with realistic enterprise-scale workloads endorses the utility of DFTL in enterprise-scale storage systems by demonstrating: (i) improved performance, (ii) reduced garbage collection overhead and (iii) better overload behavior compared to state-of-the-art FTL schemes. For example, a predominantly random-write dominant I/O trace from an OLTP application running at a large financial institution shows a 78\% improvement in average response time (due to a 3-fold reduction in operations of the garbage collector), compared to a state-of-the-art FTL scheme. Even for the well-known read-dominant TPC-H benchmark, for which DFTL introduces additional overheads, we improve system response time by 56\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Aleen:2009:CAS, author = "Farhana Aleen and Nathan Clark", title = "Commutativity analysis for software parallelization: letting program transformations see the big picture", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "241--252", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508273", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Extracting performance from many-core architectures requires software engineers to create multi-threaded applications, which significantly complicates the already daunting task of software development. 
One solution to this problem is automatic compile-time parallelization, which can ease the burden on software developers in many situations. Clearly, automatic parallelization in its present form is not suitable for many application domains and new compiler analyses are needed to address its shortcomings. In this paper, we present one such analysis: a new approach for detecting commutative functions. Commutative functions are sections of code that can be executed in any order without affecting the outcome of the application, e.g., inserting elements into a set. Previous research on this topic had one significant limitation, in that the results of commutative functions must produce identical memory layouts. This prevented previous techniques from detecting functions like malloc, which may return different pointers depending on the order in which it is called, but these differing results do not affect the overall output of the application. Our new commutativity analysis correctly identifies these situations to better facilitate automatic parallelization. We demonstrate that this analysis can automatically extract significant amounts of parallelism from many applications, and where it is ineffective it can provide software developers a useful list of functions that may be commutative provided semantic program changes that are not automatable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Suleman:2009:ACS, author = "M. Aater Suleman and Onur Mutlu and Moinuddin K. Qureshi and Yale N. 
Patt", title = "Accelerating critical section execution with asymmetric multi-core architectures", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "253--264", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508274", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To improve the performance of a single application on Chip Multiprocessors (CMPs), the application must be split into threads which execute concurrently on multiple cores. In multi-threaded applications, critical sections are used to ensure that only one thread accesses shared data at any given time. Critical sections can serialize the execution of threads, which significantly reduces performance and scalability. This paper proposes Accelerated Critical Sections (ACS), a technique that leverages the high-performance core(s) of an Asymmetric Chip Multiprocessor (ACMP) to accelerate the execution of critical sections. In ACS, selected critical sections are executed by a high-performance core, which can execute the critical section faster than the other, smaller cores. As a result, ACS reduces serialization: it lowers the likelihood of threads waiting for a critical section to finish. Our evaluation on a set of 12 critical-section-intensive workloads shows that ACS reduces the average execution time by 34\% compared to an equal-area 32T-core symmetric CMP and by 23\% compared to an equal-area ACMP. Moreover, for 7 out of the 12 workloads, ACS improves scalability by increasing the number of threads at which performance saturates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Mytkowicz:2009:PWD, author = "Todd Mytkowicz and Amer Diwan and Matthias Hauswirth and Peter F. 
Sweeney", title = "Producing wrong data without doing anything obviously wrong!", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "265--276", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508275", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a surprising result: changing a seemingly innocuous aspect of an experimental setup can cause a systems researcher to draw wrong conclusions from an experiment. What appears to be an innocuous aspect in the experimental setup may in fact introduce a significant bias in an evaluation. This phenomenon is called measurement bias in the natural and social sciences. Our results demonstrate that measurement bias is significant and commonplace in computer system evaluation. By significant we mean that measurement bias can lead to a performance analysis that either over-states an effect or even yields an incorrect conclusion. By commonplace we mean that measurement bias occurs in all architectures that we tried (Pentium 4, Core 2, and m5 O3CPU), both compilers that we tried (gcc and Intel's C compiler), and most of the SPEC CPU2006 C programs. Thus, we cannot ignore measurement bias. Nevertheless, in a literature survey of 133 recent papers from ASPLOS, PACT, PLDI, and CGO, we determined that none of the papers with experimental results adequately consider measurement bias. Inspired by similar problems and their solutions in other sciences, we describe and demonstrate two methods, one for detecting (causal analysis) and one for avoiding (setup randomization) measurement bias.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Bond:2009:LP, author = "Michael D. Bond and Kathryn S. 
McKinley", title = "Leak pruning", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "277--288", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508277", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Managed languages improve programmer productivity with type safety and garbage collection, which eliminate memory errors such as dangling pointers, double frees, and buffer overflows. However, because garbage collection uses reachability to over-approximate live objects, programs may still leak memory if programmers forget to eliminate the last reference to an object that will not be used again. Leaks slow programs by increasing collector workload and frequency. Growing leaks eventually crash programs. This paper introduces leak pruning, which keeps programs running by predicting and reclaiming leaked objects at run time. It predicts dead objects and reclaims them based on observing data structure usage patterns. Leak pruning preserves semantics because it waits for heap exhaustion before reclaiming objects and poisons references to objects it reclaims. If the program later tries to access a poisoned reference, the virtual machine (VM) throws an error. We show leak pruning has low overhead in a Java VM and evaluate it on 10 leaking programs. Leak pruning does not help two programs, executes five substantial programs 1.6-81X longer, and executes three programs, including a leak in Eclipse, for at least 24 hours. In the worst case, leak pruning defers fatal errors. 
In the best case, it keeps leaky programs running with preserved semantics and consistent throughput.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Wegiel:2009:DPC, author = "Michal Wegiel and Chandra Krintz", title = "Dynamic prediction of collection yield for managed runtimes", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "289--300", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508278", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The growth in complexity of modern systems makes it increasingly difficult to extract high-performance. The software stacks for such systems typically consist of multiple layers and include managed runtime environments (MREs). In this paper, we investigate techniques to improve cooperation between these layers and the hardware to increase the efficacy of automatic memory management in MREs. General-purpose MREs commonly implement parallel and/or concurrent garbage collection and employ compaction to eliminate heap fragmentation. Moreover, most systems trigger collection based on the amount of heap a program uses. Our analysis shows that in many cases this strategy leads to ineffective collections that are unable to reclaim sufficient space to justify the incurred cost. To avoid such collections, we exploit the observation that dead objects tend to cluster together and form large, never-referenced, regions in the address space that correlate well with virtual pages that have not recently been referenced by the application. 
We leverage this correlation to design a new, simple and light-weight, yield predictor that estimates the amount of reclaimable space in the heap using hardware page reference bits. Our predictor allows MREs to avoid low-yield collections and thereby improve resource management. We integrate this predictor into three state-of-the-art parallel compactors, implemented in the HotSpot JVM, that represent distinct canonical heap layouts. Our empirical evaluation, based on standard Java benchmarks and open-source applications, indicates that inexpensive and accurate yield prediction can improve performance significantly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Menon:2009:TSA, author = "Aravind Menon and Simon Schubert and Willy Zwaenepoel", title = "{TwinDrivers}: semi-automatic derivation of fast and safe hypervisor network drivers from guest {OS} drivers", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "301--312", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508279", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In a virtualized environment, device drivers are often run inside a virtual machine (VM) rather than in the hypervisor, for reasons of safety and reduction in software engineering effort. Unfortunately, this approach results in poor performance for I/O-intensive devices such as network cards. The alternative approach of running device drivers directly in the hypervisor yields better performance, but results in the loss of safety guarantees for the hypervisor and incurs additional software engineering costs. 
In this paper we present TwinDrivers, a framework which allows us to semi-automatically create safe and efficient hypervisor drivers from guest OS drivers. The hypervisor driver runs directly in the hypervisor, but its data resides completely in the driver VM address space. A Software Virtual Memory mechanism allows the driver to access its VM data efficiently from the hypervisor running in any guest context, and also protects the hypervisor from invalid memory accesses from the driver. An upcall mechanism allows the hypervisor to largely reuse the driver support infrastructure present in the VM. The TwinDriver system thus combines most of the performance benefits of hypervisor-based driver approaches with the safety and software engineering benefits of VM-based driver approaches. Using the TwinDrivers hypervisor driver, we are able to improve the guest domain networking throughput in Xen by a factor of 2.4 for transmit workloads, and 2.1 for receive workloads, both in CPU-scaled units, and achieve close to 64-67\% of native Linux throughput.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Burcea:2009:PBV, author = "Ioana Burcea and Andreas Moshovos", title = "{Phantom-BTB}: a virtualized branch target buffer design", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "313--324", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508281", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern processors use branch target buffers (BTBs) to predict the target address of branches such that they can fetch ahead in the instruction stream increasing concurrency and performance. 
Ideally, BTBs would be sufficiently large to capture the entire working set of the application and sufficiently small for fast access and practical on-chip dedicated storage. Depending on the application, these requirements are at odds. This work introduces a BTB design that accommodates large instruction footprints without dedicating expensive on-chip resources. In the proposed Phantom-BTB (PBTB) design, a conventional BTB is augmented with a virtual table that collects branch target information as the application runs. The virtual table does not have fixed dedicated storage. Instead, it is transparently allocated, on demand, in the on-chip caches, at cache line granularity. The entries in the virtual table are proactively prefetched and installed in the dedicated conventional BTB, thus, increasing its perceived capacity. Experimental results with commercial workloads under full-system simulation demonstrate that PBTB improves IPC performance over a 1K-entry BTB by 6.9\% on average and up to 12.7\%, with a storage overhead of only 8\%. Overall, the virtualized design performs within 1\% of a conventional 4K-entry, single-cycle access BTB, while the dedicated storage is 3.6 times smaller.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Ramani:2009:SSF, author = "Karthik Ramani and Christiaan P. 
Gribble and Al Davis", title = "{StreamRay}: a stream filtering architecture for coherent ray tracing", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "325--336", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508282", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The wide availability of commodity graphics processors has made real-time graphics an intrinsic component of the human/computer interface. These graphics cores accelerate the z-buffer algorithm and provide a highly interactive experience at a relatively low cost. However, many applications in entertainment, science, and industry require high quality lighting effects such as accurate shadows, reflection, and refraction. These effects can be difficult to achieve with z-buffer algorithms but are straightforward to implement using ray tracing. Although ray tracing is computationally more complex, the algorithm exhibits excellent scaling and parallelism properties. Nevertheless, ray tracing memory access patterns are difficult to predict and the parallelism speedup promise is therefore hard to achieve. This paper highlights a novel approach to ray tracing based on stream filtering and presents StreamRay, a multicore wide SIMD microarchitecture that delivers interactive frame rates of 15-32 frames/second for scenes of high geometric complexity and exhibits high utilization for SIMD widths ranging from eight to 16 elements. StreamRay consists of two main components: the ray engine, which is responsible for stream assembly and employs address generation units that generate addresses to form large SIMD vectors, and the filter engine, which implements the ray tracing operations with programmable accelerators. 
Results demonstrate that separating address and data processing reduces data movement and resource contention. Performance improves by 56\% while simultaneously providing 11.63\% power savings per accelerator core compared to a design which does not use separate resources for address and data computations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Cameron:2009:ASS, author = "Robert D. Cameron and Dan Lin", title = "Architectural support for {SWAR} text processing with parallel bit streams: the inductive doubling principle", journal = j-COMP-ARCH-NEWS, volume = "37", number = "1", pages = "337--348", month = mar, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2528521.1508283", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:47:19 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Parallel bit stream algorithms exploit the SWAR (SIMD within a register) capabilities of commodity processors in high-performance text processing applications such as UTF-8 to UTF-16 transcoding, XML parsing, string search and regular expression matching. Direct architectural support for these algorithms in future SWAR instruction sets could further increase performance as well as simplifying the programming task. A set of simple SWAR instruction set extensions are proposed for this purpose based on the principle of systematic support for inductive doubling as an algorithmic technique. These extensions are shown to significantly reduce instruction count in core parallel bit stream algorithms, often providing a 3X or better improvement. The extensions are also shown to be useful for SWAR programming in other application areas, including providing a systematic treatment for horizontal operations. 
An implementation model for these extensions involves relatively simple circuitry added to the operand fetch components in a pipelined processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS 2009 conference proceedings.", } @Article{Jouppi:2009:ISI, author = "Norman P. Jouppi and Rakesh Kumar and Dean Tullsen", title = "Introduction to the special issue on the {2008 Workshop on Design, Analysis, and Simulation of Chip Multiprocessors (dasCMP'08)}", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "1--1", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577131", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zeng:2009:MCA, author = "Hui Zeng and Matt Yourst and Kanad Ghose and Dmitry Ponomarev", title = "{MPTLsim}: a cycle-accurate, full-system simulator for x86-64 multicore architectures with coherent caches", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "2--9", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577132", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The introduction of multicore microprocessors in the recent years has made it imperative to use cycle-accurate and full-system simulators in the architecture research community. We introduce MPTLsim, a multicore simulator for the X86 ISA that meets this need. MPTLsim is a uop-accurate, cycle-accurate, full-system simulator for multicore designs based on the X86-64 ISA.
MPTLsim extends PTLsim, a publicly available single core simulator, with a host of additional features to support hyperthreading within a core and multiple cores, with detailed models for caches, on-chip interconnections and the memory data flow. MPTLsim incorporates detailed simulation models for cache controllers, interconnections and has built-in implementations of a number of cache coherency protocols.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Monchiero:2009:HSC, author = "Matteo Monchiero and Jung Ho Ahn and Ayose Falc{\'o}n and Daniel Ortega and Paolo Faraboschi", title = "How to simulate 1000 cores", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "10--19", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577133", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes a novel methodology to efficiently simulate shared-memory multiprocessors composed of hundreds of cores. The basic idea is to use thread-level parallelism in the software system and translate it into core-level parallelism in the simulated world. To achieve this, we first augment an existing full-system simulator to identify and separate the instruction streams belonging to the different software threads. Then, the simulator dynamically maps each instruction flow to the corresponding core of the target multi-core architecture, taking into account the inherent thread synchronization of the running applications. Our simulator allows a user to execute any multithreaded application in a conventional full-system simulator and evaluate the performance of the application on a many-core hardware. 
We carried out extensive simulations on the SPLASH-2 benchmark suite and demonstrated the scalability up to 1024 cores with limited simulation speed degradation vs. the single-core case on a fixed workload. The results also show that the proposed technique captures the intrinsic behavior of the SPLASH-2 suite, even when we scale up the number of shared-memory cores beyond the thousand-core limit.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:2009:SPP, author = "Jianwei Chen and Murali Annavaram and Michel Dubois", title = "{SlackSim}: a platform for parallel simulations of {CMPs} on {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "20--29", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577134", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The fast simulation of chip multiprocessors (CMPs) presents a critical challenge to the architecture research community as both industry and academia shift their research focus to multicore design. Parallel simulation is a technique to accelerate microarchitecture simulation of CMPs by exploiting the inherent parallelism of CMPs. In this paper, we explore the simulation paradigm of simulating each core of a target CMP in one thread and then spreading the threads across the hardware thread contexts of a host CMP. We implement several parallel simulation schemes using POSIX Threads (Pthreads). We start with cycle-by-cycle simulation and then relax the synchronization condition in various schemes, which we call slack simulations.\par In slack simulations, the Pthreads simulating different simulated cores do not synchronize after each simulated cycle, but rather they are given some slack. 
The slack is the difference in cycle between the simulated times of any two target cores. Small slacks, such as a few cycles, greatly improve the efficiency of parallel CMP simulations, with no or negligible simulation error. We have developed a simulation framework called SlackSim to experiment with various slack simulation schemes. Unlike previous attempts to parallelize multiprocessor simulations on distributed memory machines, SlackSim takes advantage of the efficient sharing of data in the host CMP architecture.\par We demonstrate the efficiency and accuracy of some well known slack simulation schemes and of some new ones on SlackSim running on a state-of-the-art CMP platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Purnaprajna:2009:RTR, author = "Madhura Purnaprajna and Mario Porrmann and Ulrich Rueckert", title = "Run-time reconfigurability in embedded multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "30--37", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577135", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To meet application-specific performance demands, architectures are predominantly redesigned and customised. Every architectural change results in huge overheads in design, verification, and fabrication, which together result in prolonged time-to-market. As an alternative, configurable architectures provide easy adaptability to different application domains in place of costly redesigns. To deal with application changes and custom requirements, a method of configuring and reusing the basic building blocks within processors is developed. Additionally, this enables co-operative multiprocessing. 
In this paper, a runtime reconfiguration mechanism for embedded multiprocessor architectures is proposed as a method to introduce customisations in the post-fabrication phase. A method of application description in conjunction with a flexible reconfigurable multiprocessor template is presented. Finally, the costs and benefits of this approach are analysed for computationally intensive algorithms used in digital signal processing. The impact of application specific characteristics on execution time, power consumption, and total energy dissipation are analysed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jesshope:2009:ISM, author = "Chris Jesshope and Mike Lankamp and Li Zhang", title = "The implementation of an {SVP} many-core processor and the evaluation of its memory architecture", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "38--45", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577136", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many-core processor architectures require scalable solutions that reflect the locality and power constraints of future generations of silicon technology. This paper presents a many-core processor that supports an abstract model of concurrency, based on a Self-adaptive Virtual Processor (SVP). This processor implements instructions, which automatically map and schedule threads providing a code devoid of any explicit communication. The thrust of this approach is to produce binary code that is divorced from implementation parameters, yet, which still gives good performance over future generations of CMPs. A key component of this processor architecture is the memory system. 
This paper briefly presents the model and evaluates its memory architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Singh:2009:RTP, author = "Karan Singh and Major Bhadauria and Sally A. McKee", title = "Real time power estimation and thread scheduling via performance counters", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "46--55", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577137", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Estimating power consumption is critical for hardware and software developers, and of the latter, particularly for OS programmers writing process schedulers. However, obtaining processor and system power consumption information can be non-trivial. Simulators are time consuming and prone to error. Power meters report whole-system consumption, but cannot give per-processor or per-thread information. More intrusive hardware instrumentation is possible, but such solutions are usually employed while designing the system, and are not meant for customer use.\par Given these difficulties, plus the current availability of some form of performance counters on virtually all platforms (even though such counters were initially designed for system bring-up, and not intended for general programmer consumption), we analytically derive functions for real-time estimation of processor and system power consumption using performance counter data on real hardware. Our model uses data gathered from microbenchmarks that capture potential application behavior. The model is independent of our test benchmarks, and thus we expect it to be well suited for future applications. 
We target chip multiprocessors, analyzing effects of shared resources and temperature on power estimation, leveraging our model to implement a simple, power-aware thread scheduler. The NAS and SPEC-OMP benchmarks show a median error of 5.8\% and 3.9\%, respectively. SPEC 2006 shows a marginally higher median error of 7.2\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Azizi:2009:AEC, author = "Omid Azizi and Aqeel Mahesri and Sanjay J. Patel and Mark Horowitz", title = "Area-efficiency in {CMP} core design: co-optimization of microarchitecture and physical design", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "56--65", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577138", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we examine the area-performance design space of a processing core for a chip multiprocessor (CMP), considering both the architectural design space and the tradeoffs of the physical design on which the architecture relies. We first propose a methodology for performing an integrated optimization of both the micro-architecture and the physical circuit design of a microprocessor. In our approach, we use statistical and convex fitting methods to capture a large micro-architectural design space. We then characterize the area-delay tradeoffs of the underlying circuits through RTL synthesis. Finally, we establish the relationship between the architecture and the circuits in an integrative model, which we use to optimize the processor. As a case study, we apply this methodology to explore the performance-area tradeoffs in a highly parallel accelerator architecture for visual computing applications.
Based on some early circuit tradeoff data, our results indicate that two separate designs are performance/area optimal for our set of benchmarks: a simpler single-issue, 2-way multithreaded core running at high-frequency, and a more aggressively tuned dual-issue 4-way multithreaded design running at a lower frequency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2009:INa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "66--69", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577140", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yelick:2009:TWW, author = "Katherine Yelick", title = "Ten ways to waste a parallel computer", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "1--1", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555755", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As clock speed increases taper off and hardware designers struggle to scale parallelism within a chip, software developers and researchers must face the challenge of writing portable software with no clear architectural target. On the hardware side, energy considerations will dominate many of the design decisions, and will ultimately limit what systems and applications can be built. 
This is especially true at the high end, where the next major milestone of exascale computing will be unattainable without major improvements in efficiency.\par Although hardware designers have long worried about the efficiency of their designs, especially for battery-operated devices, software developers in general have not. To illustrate this point, I will describe some of the top ways to waste time and therefore energy waiting for communication, synchronization, or interactions with users or other systems. Data movement, rather than computation, is the big consumer of energy, yet software often moves data up and down the memory hierarchy or across a network multiple times. At the same time, hardware designers need to take into account the constraints of the computational problems that will run on their systems, as a design that is poorly matched to the computational requirements will end up being inefficient. Drawing on my own experience in scientific computing, I will give examples of how to make the combination of hardware, algorithms and software more efficient, but also describe some of the challenges that are inherent in the application problems we want to solve. The community needs to take an integrated approach to the problem, and consider how much business or science can be done per Joule, rather than optimizing a particular component of the system in isolation. This will require rethinking the algorithms, programming models, and hardware in concert, and therefore an unprecedented level of collaboration and cooperation between hardware and software designers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "energy; parallel computer", } @Article{Lee:2009:APC, author = "Benjamin C. 
Lee and Engin Ipek and Onur Mutlu and Doug Burger", title = "Architecting phase change memory as a scalable {DRAM} alternative", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "2--13", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555758", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory scaling is in jeopardy as charge storage and sensing mechanisms become less reliable for prevalent memory technologies, such as DRAM. In contrast, phase change memory (PCM) storage relies on scalable current and thermal mechanisms. To exploit PCM's scalability as a DRAM alternative, PCM must be architected to address relatively long latencies, high energy writes, and finite endurance.\par We propose, crafted from a fundamental understanding of PCM technology parameters, area-neutral architectural enhancements that address these limitations and make PCM competitive with DRAM. A baseline PCM system is 1.6x slower and requires 2.2x more energy than a DRAM system. Buffer reorganizations reduce this delay and energy gap to 1.2x and 1.0x, using narrow rows to mitigate write energy and multiple rows to improve locality and write coalescing. Partial writes enhance memory endurance, providing 5.6 years of lifetime. 
Process scaling will further reduce PCM energy costs and improve endurance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "DRAM alternative; endurance; energy; PCM; performance; phase change memory; power; scalability", } @Article{Zhou:2009:DEE, author = "Ping Zhou and Bo Zhao and Jun Yang and Youtao Zhang", title = "A durable and energy efficient main memory using phase change memory technology", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "14--23", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555759", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Using nonvolatile memories in memory hierarchy has been investigated to reduce its energy consumption because nonvolatile memories consume zero leakage power in memory cells. One of the difficulties is, however, that the endurance of most nonvolatile memory technologies is much shorter than the conventional SRAM and DRAM technology. This has limited its usage to only the low levels of a memory hierarchy, e.g., disks, that is far from the CPU.\par In this paper, we study the use of a new type of nonvolatile memories -- the Phase Change Memory (PCM) as the main memory for a 3D stacked chip. The main challenges we face are the limited PCM endurance, longer access latencies, and higher dynamic power compared to the conventional DRAM technology. We propose techniques to extend the endurance of the PCM to an average of 13 (for MLC PCM cell) to 22 (for SLC PCM) years. We also study the design choices of implementing PCM to achieve the best tradeoff between energy and performance. Our design reduced the total energy of an already low-power DRAM main memory of the same capacity by 65\%, and energy-delay$^2$ product by 60\%. 
These results indicate that it is feasible to use PCM technology in place of DRAM in the main memory for better energy efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "endurance; low power; phase change memory", } @Article{Qureshi:2009:SHP, author = "Moinuddin K. Qureshi and Vijayalakshmi Srinivasan and Jude A. Rivers", title = "Scalable high performance main memory system using phase-change memory technology", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "24--33", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555760", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The memory subsystem accounts for a significant cost and power budget of a computer system. Current DRAM-based main memory systems are starting to hit the power and cost limit. An alternative memory technology that uses resistance contrast in phase-change materials is being actively investigated in the circuits community. {\em Phase Change Memory (PCM)\/} devices offer more density relative to DRAM, and can help increase main memory capacity of future systems while remaining within the cost and power constraints.\par In this paper, we analyze a PCM-based hybrid main memory system using an architecture level model of PCM. We explore the trade-offs for a main memory system consisting of PCM storage coupled with a small DRAM buffer. Such an architecture has the latency benefits of DRAM and the capacity benefits of PCM. Our evaluations for a baseline system of 16-cores with 8GB DRAM show that, on average, PCM can reduce page faults by 5X and provide a speedup of 3X.
As PCM is projected to have limited write endurance, we also propose simple organizational and management solutions of the hybrid memory that reduces the write traffic to PCM, boosting its lifetime from 3 years to 9.7 years.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "DRAM caching; phase change memory; wear leveling", } @Article{Wu:2009:HCA, author = "Xiaoxia Wu and Jian Li and Lixin Zhang and Evan Speight and Ram Rajamony and Yuan Xie", title = "Hybrid cache architecture with disparate memory technologies", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "34--45", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555761", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Caching techniques have been an efficient mechanism for mitigating the effects of the processor-memory speed gap. Traditional multi-level SRAM-based cache hierarchies, especially in the context of chip multiprocessors (CMPs), present many challenges in area requirements, core-to-cache balance, power consumption, and design complexity. New advancements in technology enable caches to be built from other technologies, such as Embedded DRAM (EDRAM), Magnetic RAM (MRAM), and Phase-change RAM (PRAM), in both 2D chips or 3D stacked chips. Caches fabricated in these technologies offer dramatically different power and performance characteristics when compared with SRAM-based caches, particularly in the areas of access latency, cell density, and overall power consumption. In this paper, we propose to take advantage of the best characteristics that each technology offers, through the use of Hybrid Cache Architecture (HCA) designs. 
We discuss and evaluate two types of hybrid cache architectures: inter cache Level HCA (LHCA), in which the levels in a cache hierarchy can be made of disparate memory technologies; and intra cache level or cache Region based HCA (RHCA), where a single level of cache can be partitioned into multiple regions, each of a different memory technology. We have studied a number of different HCA architectures and explored the potential of hardware support for intra-cache data movement and power consumption management within HCA caches. Utilizing a full-system simulator that has been validated against real hardware, we demonstrate that an LHCA design can provide a geometric mean 7\% IPC improvement over a baseline 3-level SRAM cache design under the same area constraint across a collection of 25 workloads. A more aggressive RHCA-based design provides 12\% IPC improvement over the baseline. Finally, a 2-layer 3D cache stack (3DHCA) of high density memory technology within the same chip footprint gives 18\% IPC improvement over the baseline. Furthermore, up to 70\% reduction in power consumption over a baseline SRAM-only design is achieved.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "hybrid cache architecture; three-dimensional IC", } @Article{Suh:2009:DMR, author = "Jinho Suh and Michel Dubois", title = "Dynamic {MIPS} rate stabilization in out-of-order processors", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "46--56", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555763", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Today's microprocessor cores reach high performance levels not only by their high clock rate but also by the concurrent execution of a large number of instructions. 
Because of the relationship between power and frequency, it becomes attractive to run an OoO (Out-of-Order) core at a frequency lower than its nominal frequency in the context of embedded or real-time systems. Unfortunately, whereas OoO pipelines have high average throughput, their highly variable and hard-to-predict execution rate makes them unsuitable for real-time systems with hard or even soft deadlines. In this paper, we demonstrate that the execution time of an OoO processor can be stable and predictable by controlling its MIPS (Mega Instructions Per Second) rate via a PID (Proportional, Integral, and Differential gain) feedback controller and DVFS (Dynamic Voltage and Frequency Scaling). The stabilized processor uses much less power per committed instruction, because of the reduced average frequency. The EPI (Energy Per Instruction) is also cut by an average of 28\% across our benchmark programs. Since a stable MIPS rate is maintained consistently with lower power/energy per instruction, OoO processors stabilized by a feedback controller can realistically be deployed in real-time systems. To demonstrate this capability we select a subset of the MiBench benchmarks that displays the widest execution rate variations and stabilize their MIPS rate in the context of a 1GHz Pentium III-like microarchitecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "embedded systems; OoO processors; real-time systems; stabilization; variability", } @Article{Paolieri:2009:HSW, author = "Marco Paolieri and Eduardo Qui{\~n}ones and Francisco J. 
Cazorla and Guillem Bernat and Mateo Valero", title = "Hardware support for {WCET} analysis of hard real-time multicore systems", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "57--68", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555764", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The increasing demand for new functionalities in current and future hard real-time embedded systems like automotive, avionics and space industries is driving an increase in the performance required in embedded processors. Multicore processors represent a good design solution for such systems due to their high performance, low cost and power consumption characteristics. However, hard real-time embedded systems require time analyzability and current multicore processors are less analyzable than single-core processors due to the interferences between different tasks when accessing shared hardware resources. In this paper we propose a multicore architecture with shared resources that allows the execution of applications with hard real-time and non hard real-time constraints at the same time, providing time analyzability for the hard real-time tasks so that they can meet their deadlines. Moreover our architecture proposal provides high-performance for the non hard real-time tasks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "analyzability; cache partitioning; hard real-time; interconnection network; multicore; real-time embedded systems; WCET", } @Article{Somogyi:2009:STM, author = "Stephen Somogyi and Thomas F.
Wenisch and Anastasia Ailamaki and Babak Falsafi", title = "Spatio-temporal memory streaming", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "69--80", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555766", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent research advocates memory streaming techniques to alleviate the performance bottleneck caused by the high latencies of off-chip memory accesses. Temporal memory streaming replays previously observed miss sequences to eliminate long chains of dependent misses. Spatial memory streaming predicts repetitive data layout patterns within fixed-size memory regions. Because each technique targets a different subset of misses, their effectiveness varies across workloads and each leaves a significant fraction of misses unpredicted.\par In this paper, we propose Spatio-Temporal Memory Streaming (STeMS) to exploit the synergy between spatial and temporal streaming. We observe that the order of spatial accesses repeats both within and across regions. STeMS records and replays the temporal sequence of region accesses and uses spatial relationships within each region to dynamically reconstruct a predicted total miss order. 
Using trace-driven and cycle-accurate simulation across a suite of commercial workloads, we demonstrate that with similar implementation complexity as temporal streaming, STeMS achieves equal or higher coverage than spatial or temporal memory streaming alone, and improves performance by 31\%, 3\%, and 18\% over stride, spatial, and temporal prediction, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "prefetching; spatial correlation; temporal correlation", } @Article{Diaz:2009:SCE, author = "Pedro Diaz and Marcelo Cintra", title = "Stream chaining: exploiting multiple levels of correlation in data prefetching", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "81--92", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555767", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Data prefetching has long been an important technique to amortize the effects of the memory wall, and is likely to remain so in the current era of multi-core systems. Most prefetchers operate by identifying patterns and correlations in the miss address stream. Separating streams according to the memory access instruction that generates the misses is an effective way of filtering out spurious addresses from predictable streams. On the other hand, by localizing streams based on the memory access instructions, such prefetchers both lose the complete time sequence information of misses and can only issue prefetches for a single memory access instruction at a time.\par This paper proposes a novel class of prefetchers based on the idea of linking various localized streams into predictable chains of missing memory access instructions such that the prefetcher can issue prefetches along multiple streams. 
In this way the prefetcher is not limited to prefetching deeply for a single missing memory access instruction but can instead adaptively prefetch for other memory access instructions closer in time.\par Experimental results show that the proposed prefetcher consistently achieves better performance than a state-of-the-art prefetcher -- 10\% on average, being only outperformed in very few cases and then by only 2\%, and outperforming that prefetcher by as much as 55\% -- while consuming the same amount of memory bandwidth.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "data prefetching", } @Article{Powell:2009:ACS, author = "Michael D. Powell and Arijit Biswas and Shantanu Gupta and Shubhendu S. Mukherjee", title = "Architectural core salvaging in a multi-core processor for hard-error tolerance", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "93--104", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555769", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The incidence of hard errors in CPUs is a challenge for future multicore designs due to increasing total core area. Even if the location and nature of hard errors are known a priori, either at manufacture-time or in the field, cores with such errors must be disabled in the absence of hard-error tolerance. While caches, with their regular and repetitive structures, are easily covered against hard errors by providing spare arrays or spare lines, structures within a core are neither as regular nor as repetitive. Previous work has proposed microarchitectural core salvaging to exploit structural redundancy within a core and maintain functionality in the presence of hard errors. 
Unfortunately microarchitectural salvaging introduces complexity and may provide only limited coverage of core area against hard errors due to a lack of natural redundancy in the core.\par This paper makes a case for architectural core salvaging. We observe that even if some individual cores cannot execute certain operations, a CPU die can be instruction-set-architecture (ISA) compliant, that is execute all of the instructions required by its ISA, by exploiting natural cross-core redundancy. We propose using hardware to migrate offending threads to another core that can execute the operation. Architectural core salvaging can cover a large core area against faults, and be implemented by leveraging known techniques that minimize changes to the microarchitecture. We show it is possible to optimize architectural core salvaging such that the performance on a faulty die approaches that of a fault-free die---assuring significantly better performance than core disabling for many workloads and no worse performance than core disabling for the remainder.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "core salvaging; hard errors; redundancy; reliability", } @Article{Carretero:2009:EER, author = "Javier Carretero and Pedro Chaparro and Xavier Vera and Jaume Abella and Antonio Gonz{\'a}lez", title = "End-to-end register data-flow continuous self-test", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "105--115", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555770", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "While Moore's Law predicts the ability of semi-conductor industry to engineer smaller and more efficient transistors and circuits, there are serious issues not contemplated in that law.
One concern is the verification effort of modern computing systems, which has grown to dominate the cost of system design. On the other hand, technology scaling leads to burn-in phase out. As a result, in-the-field error rate may increase due to both actual errors and latent defects. Whereas data can be protected with arithmetic codes (like parity or ECC), there is a lack of cost-effective mechanisms for control logic.\par This paper presents a light-weight microarchitectural mechanism that ensures that data consumed through registers are correct. Microarchitecture presents a new way to manage reliability and testing without significantly sacrificing cost and performance, offering a unique opportunity to detect errors in the field at low cost. Our results show a coverage around 90\% for the targeted structures with a cost in power and area of about 4\%. The structures protected include the issue queue logic and the data associated (i.e., tags, control signals), input multiplexors, rename data, replay logic, register free list, bypasses data and logic, MOB data and addresses, register file logic, register file storage and functional units.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "control logic; degradation; design errors; end-to-end protection; online testing", } @Article{Yoon:2009:MME, author = "Doe Hyun Yoon and Mattan Erez", title = "Memory mapped {ECC}: low-cost error protection for last level caches", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "116--127", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555771", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a novel technique, Memory Mapped ECC, which reduces the cost of providing error correction 
for SRAM caches. It is important to limit such overheads as processor resources become constrained and error propensity increases. The continuing decrease in SRAM cell size and the growing capacity of caches increase the likelihood of errors in SRAM arrays. To address this, redundant information can be used to correct a value after an error occurs. Information redundancy is typically provided through error-correcting codes (ECC), which append bits to every SRAM row and increase the array's area and energy consumption. We make three observations regarding error protection and utilize them in our architecture: (1) much of the data in a cache is replicated throughout the hierarchy and is inherently redundant; (2) error-detection is necessary for every cache access and is cheaper than error correction, which is very infrequent; (3) redundant information for correction need not be stored in high-cost SRAM. Our unique architecture only dedicates SRAM for error detection while the ECC bits are stored within the memory hierarchy as data. We associate a physical memory address with each cache line for ECC storage and rely on locality to minimize the impact. The cache is dynamically and transparently partitioned between data and ECC with the fraction of ECC growing with the number of dirty cache lines.
We show that this has little impact on both performance (1.3\% average and $<$ 4\%) and memory traffic (3\%) across a range of memory-intensive applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "error correction; last-level caches; reliability; soft error", } @Article{Woh:2009:AAA, author = "Mark Woh and Sangwon Seo and Scott Mahlke and Trevor Mudge and Chaitali Chakrabarti and Krisztian Flautner", title = "{AnySP}: anytime anywhere anyway signal processing", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "128--139", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555773", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the past decade, the proliferation of mobile devices has increased at a spectacular rate. There are now more than 3.3 billion active cell phones in the world---a device that we now all depend on in our daily lives. The current generation of devices employs a combination of general-purpose processors, digital signal processors, and hardwired accelerators to provide giga-operations-per-second performance on milliwatt power budgets. Such heterogeneous organizations are inefficient to build and maintain, as well as waste silicon area and power. Looking forward to the next generation of mobile computing, computation requirements will increase by one to three orders of magnitude due to higher data rates, increased complexity algorithms, and greater computation diversity but the power requirements will be just as stringent. Scaling of existing approaches will not suffice; instead, the inherent computational efficiency, programmability, and adaptability of the hardware must change.
To overcome these challenges, this paper proposes an example architecture, referred to as AnySP, for the next generation mobile signal processing. AnySP uses a co-design approach where the next generation wireless signal processing and high-definition video algorithms are analyzed to create a domain specific programmable architecture. At the heart of AnySP is a configurable single-instruction multiple-data datapath that is capable of processing wide vectors or multiple narrow vectors simultaneously. In addition, deeper computation subgraphs can be pipelined across the single-instruction multiple-data lanes. These three operating modes provide high throughput across varying application types. Results show that AnySP is capable of sustaining 4G wireless processing and high-definition video throughput rates, and will approach the 1000 Mops/mW efficiency barrier when scaled to 45nm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "fully programmable architecture; high-end signal processing; low-power architecture; SIMD; single-instruction multiple-data parallelism; software defined radio", } @Article{Kelm:2009:RAS, author = "John H. Kelm and Daniel R. Johnson and Matthew R. Johnson and Neal C. Crago and William Tuohy and Aqeel Mahesri and Steven S. Lumetta and Matthew I. Frank and Sanjay J. Patel", title = "{Rigel}: an architecture and scalable programming interface for a 1000-core accelerator", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "140--151", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555774", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper considers Rigel, a programmable accelerator architecture for a broad class of data- and task-parallel computation. 
Rigel comprises 1000+ hierarchically-organized cores that use a fine-grained, dynamically scheduled single-program, multiple-data (SPMD) execution model. Rigel's low-level programming interface adopts a single global address space model where parallel work is expressed in a task-centric, bulk-synchronized manner using minimal hardware support. Compared to existing accelerators, which contain domain-specific hardware, specialized memories, and/or restrictive programming models, Rigel is more flexible and provides a straightforward target for a broader set of applications.\par We perform a design analysis of Rigel to quantify the compute density and power efficiency of our initial design. We find that Rigel can achieve a density of over 8 single-precision GFLOPS/mm$^2$ in 45nm, which is comparable to high-end GPUs scaled to 45nm. We perform experimental analysis on several applications ported to the Rigel low-level programming interface. We examine scalability issues related to work distribution, synchronization, and load-balancing for 1000-core accelerators using software techniques and minimal specialized hardware support. 
We find that while it is important to support fast task distribution and barrier operations, these operations can be implemented without specialized hardware using flexible hardware primitives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "accelerator; computer architecture; low-level programming interface", } @Article{Hong:2009:AMG, author = "Sunpyo Hong and Hyesoon Kim", title = "An analytical model for a {GPU} architecture with memory-level and thread-level parallelism awareness", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "152--163", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555775", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "GPU architectures are increasingly important in the multi-core era due to their high number of parallel processors. Programming thousands of massively parallel threads is a big challenge for software engineers, but understanding the performance bottlenecks of those parallel programs on GPU architectures to improve application performance is even more difficult. Current approaches rely on programmers to tune their applications by exploiting the design space exhaustively without fully understanding the performance characteristics of their applications.\par To provide insights into the performance bottlenecks of parallel applications on GPU architectures, we propose a simple analytical model that estimates the execution time of massively parallel programs. The key component of our model is estimating the number of parallel memory requests (we call this the memory warp parallelism) by considering the number of running threads and memory bandwidth. 
Based on the degree of memory warp parallelism, the model estimates the cost of memory requests, thereby estimating the overall execution time of a program. Comparisons between the outcome of the model and the actual execution time in several GPUs show that the geometric mean of absolute error of our model on micro-benchmarks is 5.4\% and on GPU computing applications is 13.3\%. All the applications are written in the CUDA programming language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "analytical model; CUDA; GPU architecture; memory level parallelism; performance estimation; warp level parallelism", } @Article{Biswas:2009:MEM, author = "Susmit Biswas and Diana Franklin and Alan Savage and Ryan Dixon and Timothy Sherwood and Frederic T. Chong", title = "Multi-execution: multicore caching for data-similar executions", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "164--173", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555777", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "While microprocessor designers turn to multicore architectures to sustain performance expectations, the dramatic increase in parallelism of such architectures will put substantial demands on off-chip bandwidth and make the memory wall more significant than ever. This paper demonstrates that one profitable application of multicore processors is the execution of many similar instantiations of the same program. We identify that this model of execution is used in several practical scenarios and term it as `multi-execution.' Often, each such instance utilizes very similar data. In conventional cache hierarchies, each instance would cache its own data independently.
We propose the Mergeable cache architecture that detects data similarities and merges cache blocks, resulting in substantial savings in cache storage requirements. This leads to reductions in off-chip memory accesses and overall power usage, and increases in application performance. We present cycle-accurate simulation results of 8 benchmarks (6 from SPEC2000) to demonstrate that our technique provides a scalable solution and leads to significant speedups due to reductions in main memory accesses. For 8 cores running 8 similar executions of the same application and sharing an exclusive 4-MB, 8-way L2 cache, the Mergeable cache shows a speedup in execution by 2.5x on average (ranging from 0.93x to 6.92x), while posing an overhead of only 4.28\% on cache area and 5.21\% on power when it is used.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "CMP; data similar execution; multicore cache design", } @Article{Xie:2009:PPI, author = "Yuejian Xie and Gabriel H. Loh", title = "{PIPP}: promotion\slash insertion pseudo-partitioning of multi-core shared caches", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "174--183", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555778", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many multi-core processors employ a large last-level cache (LLC) shared among the multiple cores. Past research has demonstrated that sharing-oblivious cache management policies (e.g., LRU) can lead to poor performance and fairness when the multiple cores compete for the limited LLC capacity. Different memory access patterns can cause cache contention in different ways, and various techniques have been proposed to target some of these behaviors. 
In this work, we propose a new cache management approach that combines dynamic insertion and promotion policies to provide the benefits of cache partitioning, adaptive insertion, and capacity stealing all with a single mechanism. By handling multiple types of memory behaviors, our proposed technique outperforms techniques that target only either capacity partitioning or adaptive insertion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache; contention; insertion; multi-core; promotion; sharing", } @Article{Hardavellas:2009:RNN, author = "Nikos Hardavellas and Michael Ferdman and Babak Falsafi and Anastasia Ailamaki", title = "{Reactive NUCA}: near-optimal block placement and replication in distributed caches", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "184--195", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555779", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Increases in on-chip communication delay and the large working sets of server and scientific workloads complicate the design of the on-chip last-level cache for multicore processors. The large working sets favor a shared cache design that maximizes the aggregate cache capacity and minimizes off-chip memory requests. At the same time, the growing on-chip communication delay favors core-private caches that replicate data to minimize delays on global wires. 
Recent hybrid proposals offer lower average latency than conventional designs, but they address the placement requirements of only a subset of the data accessed by the application, require complex lookup and coherence mechanisms that increase latency, or fail to scale to high core counts.\par In this work, we observe that the cache access patterns of a range of server and scientific workloads can be classified into distinct classes, where each class is amenable to different block placement policies. Based on this observation, we propose Reactive NUCA (R-NUCA), a distributed cache design which reacts to the class of each cache access and places blocks at the appropriate location in the cache. R-NUCA cooperates with the operating system to support intelligent placement, migration, and replication without the overhead of an explicit coherence mechanism for the on-chip last-level cache. In a range of server, scientific, and multiprogrammed workloads, R-NUCA matches the performance of the best cache design for each workload, improving performance by 14\% on average over competing designs and by 32\% at best, while achieving performance within 5\% of an ideal cache design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "block migration; block placement; block replication; cache; cache coherence; cache indexing; cache lookup; cache management; chip multiprocessor; cmp; coherence; data migration; data placement; data replication; interleaving; last-level cache; lookup; migration; multi-core; multicore; non-uniform cache access; NUCA; placement; private cache; R-NUCA; Reactive NUCA; replication; rotational interleaving; shared cache", } @Article{Moscibroda:2009:CBR, author = "Thomas Moscibroda and Onur Mutlu", title = "A case for bufferless routing in on-chip networks", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "196--207", month = jun, year = "2009", CODEN = 
"CANED2", DOI = "https://doi.org/10.1145/1555815.1555781", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Buffers in on-chip networks consume significant energy, occupy chip area, and increase design complexity. In this paper, we make a case for a new approach to designing on-chip interconnection networks that eliminates the need for buffers for routing or flow control. We describe new algorithms for routing without using buffers in router input/output ports. We analyze the advantages and disadvantages of bufferless routing and discuss how router latency can be reduced by taking advantage of the fact that input/output buffers do not exist. Our evaluations show that routing without buffers significantly reduces the energy consumption of the on-chip cache/processor-to-cache network, while providing similar performance to that of existing buffered routing algorithms at low network utilization (i.e., on most real applications). We conclude that bufferless routing can be an attractive and energy-efficient design option for on-chip cache/processor-to-cache networks where network utilization is low.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "memory systems; multi-core; on-chip networks; routing", } @Article{Kinsy:2009:AAD, author = "Michel A. 
Kinsy and Myong Hyon Cho and Tina Wen and Edward Suh and Marten van Dijk and Srinivas Devadas", title = "Application-aware deadlock-free oblivious routing", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "208--219", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555782", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Conventional oblivious routing algorithms are either not application-aware or assume that each flow has its own private channel to ensure deadlock avoidance. We present a framework for application-aware routing that assures deadlock-freedom under one or more channels by forcing routes to conform to an acyclic channel dependence graph. Arbitrary minimal routes can be made deadlock-free through appropriate static channel allocation when two or more channels are available. Given bandwidth estimates for flows, we present a mixed integer-linear programming (MILP) approach and a heuristic approach for producing deadlock-free routes that minimize maximum channel load. The heuristic algorithm is calibrated using the MILP algorithm and evaluated on a number of benchmarks through detailed network simulation. Our framework can be used to produce application-aware routes that target the minimization of latency, number of flows through a link, bandwidth, or any combination thereof.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "oblivious routing; on-chip interconnection networks; systems-on-chip", } @Article{Jiang:2009:IAR, author = "Nan Jiang and John Kim and William J. 
Dally", title = "Indirect adaptive routing on large scale interconnection networks", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "220--231", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555783", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recently proposed high-radix interconnection networks [10] require global adaptive routing to achieve optimum performance. Existing direct adaptive routing methods are slow to sense congestion remote from the source router and hence misroute many packets before such congestion is detected. This paper introduces indirect global adaptive routing (IAR) in which the adaptive routing decision uses information that is not directly available at the source router. We describe four IAR routing methods: credit round trip (CRT) [10], progressive adaptive routing (PAR), piggyback routing (PB), and reservation routing (RES). We evaluate each of these methods on the dragonfly topology under both steady-state and transient loads. Our results show that PB, PAR, and CRT all achieve good performance. PB provides the best absolute performance, with 2--7\% lower latency on steady-state uniform random traffic at 70\% load, while PAR provides the fastest response on transient loads.
We also evaluate the implementation costs of the indirect adaptive routing methods and show that PB has the lowest implementation cost requiring $<$1\% increase in the total storage of a typical high-radix router.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dragonfly; interconnection networks; routing", } @Article{Hamilton:2009:ISS, author = "James Hamilton", title = "{Internet}-scale service infrastructure efficiency", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "232--232", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555756", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "High-scale cloud services provide economies of scale of five to ten over small-scale deployments, and are becoming a large part of both enterprise information processing and consumer services. Even very large enterprise IT deployments have quite different cost drivers and optimization points from internet-scale services. The former are people-dominated from a cost perspective whereas internet-scale service costs are driven by server hardware and infrastructure with people costs fading into the noise at less than 10\%.\par In this talk we inventory where the infrastructure costs are in internet-scale services. We track power distribution from 115 kV at the property line through all conversions into the data center tracking the losses to final delivery at semiconductor voltage levels. We track cooling and all the energy conversions from power dissipation through release to the environment outside of the building.
Understanding where the costs and inefficiencies lie, we'll look more closely at cooling and overall mechanical system design, server hardware design, and software techniques including graceful degradation mode, power yield management, and resource consumption shaping.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "efficiency; Internet-scale", } @Article{Blundell:2009:IPT, author = "Colin Blundell and Milo M. K. Martin and Thomas F. Wenisch", title = "{InvisiFence}: performance-transparent memory ordering in conventional multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "233--244", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555785", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A multiprocessor's memory consistency model imposes ordering constraints among loads, stores, atomic operations, and memory fences. Even for consistency models that relax ordering among loads and stores, ordering constraints still induce significant performance penalties due to atomic operations and memory ordering fences. Several prior proposals reduce the performance penalty of strongly ordered models using post-retirement speculation, but these designs either (1) maintain speculative state at a per-store granularity, causing storage requirements to grow proportionally to speculation depth, or (2) employ distributed global commit arbitration using unconventional chunk-based invalidation mechanisms. In this paper we propose InvisiFence, an approach for implementing memory ordering based on post-retirement speculation that avoids these concerns. 
InvisiFence leverages minimalistic mechanisms for post-retirement speculation proposed in other contexts to (1) track speculative state efficiently at block-granularity with dedicated storage requirements independent of speculation depth, (2) provide fast commit by avoiding explicit commit arbitration, and (3) operate under a conventional invalidation-based cache coherence protocol. InvisiFence supports both modes of operation found in prior work: speculating only when necessary to minimize the risk of rollback-inducing violations or speculating continuously to decouple consistency enforcement from the processor core. Overall, InvisiFence requires approximately one kilobyte of additional state to transform a conventional multiprocessor into one that provides performance-transparent memory ordering, fences, and atomic operations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "memory consistency; parallel programming",
}

@Article{Hilton:2009:DSC,
  author =       "Andrew Hilton and Amir Roth",
  title =        "Decoupled store completion\slash silent deterministic replay: enabling scalable data memory for {CPR\slash CFP} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "245--254",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555786",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "CPR/CFP (Checkpoint Processing and Recovery/Continual Flow Pipeline) support an adaptive instruction window that scales to tolerate last-level cache misses. CPR/CFP scale the register file by aggressively reclaiming the destination registers of many in-flight instructions. However, an analogous mechanism does not exist for stores and loads.
As the window expands, CPR/CFP processors must track all in-flight stores and loads to support forwarding and detect memory ordering violations.\par The previously-described SVW (Store Vulnerability Window) and SQIP (Store Queue Index Prediction) schemes provide scalable, non-associative load and store queues, respectively. However, they don't work smoothly in a CPR/CFP context. SVW/SQIP rely on the ability to dynamically stall some loads until a specific older store writes to the cache. Enforcing this serialization in CPR/CFP is expensive if the load and store are in the same checkpoint.\par We introduce two complementary procedures that implement this serialization efficiently. Decoupled Store Completion (DSC) allows stores to write to the cache before the enclosing checkpoint completes execution. Silent Deterministic Replay (SDR) supports mis-speculation recovery in the presence of DSC by replaying loads older than completed stores using values from the load queue. The combination of DSC and SDR enables an SVW/SQIP based CPR/CFP memory system that outperforms previous designs while occupying less area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "checkpoint processors; load-store queues",
}

@Article{Zheng:2009:DDB,
  author =       "Hongzhong Zheng and Jiang Lin and Zhao Zhang and Zhichun Zhu",
  title =        "Decoupled {DIMM}: building high-bandwidth memory system using low-speed {DRAM} devices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "255--266",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555788",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The widespread use of multicore processors has dramatically increased the demands on high bandwidth and large capacity from memory
systems. In a conventional DDR2/DDR3 DRAM memory system, the memory bus and DRAM devices run at the same data rate. To improve memory bandwidth, we propose a new memory system design called decoupled DIMM that allows the memory bus to operate at a data rate much higher than that of the DRAM devices. In the design, a synchronization buffer is added to relay data between the slow DRAM devices and the fast memory bus; and memory access scheduling is revised to avoid access conflicts on memory ranks. The design not only improves memory bandwidth beyond what can be supported by current memory devices, but also improves reliability, power efficiency, and cost effectiveness by using relatively slow memory devices. The idea of decoupling, precisely the decoupling of bandwidth match between memory bus and a single rank of devices, can also be applied to other types of memory systems including FB-DIMM.\par Our experimental results show that a decoupled DIMM system of 2667MT/s bus data rate and 1333MT/s device data rate improves the performance of memory-intensive workloads by 51\% on average over a conventional memory system of 1333MT/s data rate. Alternatively, a decoupled DIMM system of 1600MT/s bus data rate and 800MT/s device data rate incurs only 8\% performance loss when compared with a conventional system of 1600MT/s data rate, with 16\% reduction on the memory power consumption and 9\% saving on memory energy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bandwidth decoupling; decoupled DIMM; DRAM memories",
}

@Article{Lim:2009:DME,
  author =       "Kevin Lim and Jichuan Chang and Trevor Mudge and Parthasarathy Ranganathan and Steven K. Reinhardt and Thomas F. Wenisch",
  title =        "Disaggregated memory for expansion and sharing in blade servers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "267--278",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555789",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Analysis of technology and application trends reveals a growing imbalance in the peak compute-to-memory-capacity ratio for future servers. At the same time, the fraction contributed by memory systems to total datacenter costs and power consumption during typical usage is increasing. In response to these trends, this paper re-examines traditional compute-memory co-location on a single system and details the design of a new general-purpose architectural building block---a memory blade---that allows memory to be `disaggregated' across a system ensemble. This remote memory blade can be used for memory capacity expansion to improve performance and for sharing memory across servers to reduce provisioning and power costs. We use this memory blade building block to propose two new system architecture solutions---(1) page-swapped remote memory at the virtualization layer, and (2) block-access remote memory with support in the coherence hardware---that enable transparent memory expansion and sharing on commodity-based systems.
Using simulations of a mix of enterprise benchmarks supplemented with traces from live datacenters, we demonstrate that memory disaggregation can provide substantial performance benefits (on average 10X) in memory constrained environments, while the sharing enabled by our solutions can improve performance-per-dollar by up to 57\% when optimizing memory provisioning across multiple servers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "disaggregated memory; memory blades; memory capacity expansion; power and cost efficiencies",
}

@Article{Dirik:2009:PPS,
  author =       "Cagdas Dirik and Bruce Jacob",
  title =        "The performance of {PC} solid-state disks {(SSDs)} as a function of bandwidth, concurrency, device architecture, and system organization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "279--289",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555790",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As their prices decline, their storage capacities increase, and their endurance improves, NAND Flash Solid State Disks (SSD) provide an increasingly attractive alternative to Hard Disk Drives (HDD) for portable computing systems and PCs.
This paper presents a study of NAND Flash SSD architectures and their management techniques, quantifying SSD performance under user-driven/PC applications in a multi-tasked environment; user activity represents typical PC workloads and includes browsing files and folders, emailing, text editing and document creation, surfing the web, listening to music and playing movies, editing large pictures, and running office applications.\par We find the following: (a) the real limitation to NAND Flash memory performance is not its low per-device bandwidth but its internal core interface; (b) NAND Flash memory media transfer rates do not need to scale up to those of HDDs for good performance; (c) SSD organizations that exploit concurrency at both the system and device level (e.g. RAID-like organizations and Micron-style superblocks) improve performance significantly; and (d) these system- and device-level concurrency mechanisms are, to a significant degree, orthogonal: that is, the performance increase due to one does not come at the expense of the other, as each exploits a different facet of concurrency exhibited within the PC workload.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "flash memory; performance; solid state disks; storage systems",
}

@Article{Bhattacharjee:2009:TCP,
  author =       "Abhishek Bhattacharjee and Margaret Martonosi",
  title =        "Thread criticality predictors for dynamic performance, power, and resource management in chip multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "290--301",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555792",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the shift towards chip multiprocessors (CMPs), exploiting and
managing parallelism has become a central problem in computing systems. Many issues of parallelism management boil down to discerning which running threads or processes are critical, or slowest, versus which are non-critical. If one can accurately predict critical threads in a parallel program, then one can respond in a variety of ways. Possibilities include running the critical thread at a faster clock rate, performing load balancing techniques to offload work onto currently non-critical threads, or giving the critical thread more on-chip resources to execute faster.\par This paper proposes and evaluates simple but effective thread criticality predictors for parallel applications. We show that accurate predictors can be built using counters that are typically already available on-chip. Our predictor, based on memory hierarchy statistics, identifies thread criticality with an average accuracy of 93\% across a range of architectures.\par We also demonstrate two applications of our predictor. First, we show how Intel's Threading Building Blocks (TBB) parallel runtime system can benefit from task stealing techniques that use our criticality predictor to reduce load imbalance. Using criticality prediction to guide TBB's task-stealing decisions improves performance by 13--32\% for TBB-based PARSEC benchmarks running on a 32-core CMP. As a second application, criticality prediction guides dynamic energy optimizations in barrier-based applications. By running the predicted critical thread at the full clock rate and frequency-scaling non-critical threads, this approach achieves average energy savings of 15\% while negligibly degrading performance for SPLASH-2 and PARSEC benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "caches; DVFS; Intel TBB; parallel processing; thread criticality prediction",
}

@Article{Rangan:2009:TMF,
  author =       "Krishna K.
Rangan and Gu-Yeon Wei and David Brooks",
  title =        "Thread motion: fine-grained power management for multi-core systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "302--313",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555793",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Dynamic voltage and frequency scaling (DVFS) is a commonly-used power-management scheme that dynamically adjusts power and performance to the time-varying needs of running programs. Unfortunately, conventional DVFS, relying on off-chip regulators, faces limitations in terms of temporal granularity and high costs when considered for future multi-core systems. To overcome these challenges, this paper presents thread motion (TM), a fine-grained power-management scheme for chip multiprocessors (CMPs). Instead of incurring the high cost of changing the voltage and frequency of different cores, TM enables rapid movement of threads to adapt the time-varying computing needs of running applications to a mixture of cores with fixed but different power/performance levels. Results show that for the same power budget, two voltage/frequency levels are sufficient to provide performance gains commensurate to idealized scenarios using per-core voltage control.
Thread motion extends workload-based power management into the nanosecond realm and, for a given power budget, provides up to 20\% better performance than coarse-grained DVFS.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "DVFS; multi-core power management; thread motion",
}

@Article{Wang:2009:TCP,
  author =       "Yefu Wang and Kai Ma and Xiaorui Wang",
  title =        "Temperature-constrained power control for chip multiprocessors with online model estimation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "314--324",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555794",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As chip multiprocessors (CMP) become the main trend in processor development, various power and thermal management strategies have recently been proposed to optimize system performance while controlling the power or temperature of a CMP chip to stay below a constraint. The availability of per-core DVFS (dynamic voltage and frequency scaling) also makes it possible to develop advanced management strategies. However, most existing solutions rely on open-loop search or optimization with the assumption that power can be estimated accurately, while others adopt oversimplified feedback control strategies to control power and temperature separately, without any theoretical guarantees. In this paper, we propose a chip-level power control algorithm that is systematically designed based on optimal control theory. Our algorithm can precisely control the power of a CMP chip to the desired set point while maintaining the temperature of each core below a specified threshold.
Furthermore, an online model estimator is designed to achieve analytical assurance of control accuracy and system stability, even in the face of significant workload variations or unpredictable chip or core variations. Empirical results on a physical testbed show that our controller outperforms two state-of-the-art control algorithms by having better SPEC benchmark performance and more precise power control. In addition, extensive simulation results demonstrate the efficacy of our algorithm for various CMP configurations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessor; feedback control; power management",
}

@Article{Yu:2009:CIC,
  author =       "Jie Yu and Satish Narayanasamy",
  title =        "A case for an interleaving constrained shared-memory multi-processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "325--336",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555796",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Shared-memory multi-threaded programming is inherently more difficult than single-threaded programming. The main source of complexity is that, the threads of an application can interleave in so many different ways. To ensure correctness, a programmer has to test all possible thread interleavings, which, however, is impractical.\par Many rare thread interleavings remain untested in production systems, and they are the root cause for a majority of concurrency bugs. We propose a shared-memory multi-processor design that avoids untested interleavings to improve the correctness of a multi-threaded program.
Since untested interleavings tend to occur infrequently at runtime, the performance cost of avoiding them is not high.\par We propose to encode the set of tested correct interleavings in a program's binary executable using {\em Predecessor Set (PSet)\/} constraints. These constraints are efficiently enforced at runtime using processor support, which ensures that the runtime follows a tested interleaving. We analyze several bugs in open source applications such as MySQL, Apache, Mozilla, etc., and show that, by enforcing PSet constraints, we can avoid not only data races and atomicity violations, but also other forms of concurrency bugs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "concurrency bugs; multiprocessors; parallel programming; software reliability",
}

@Article{Muzahid:2009:SSB,
  author =       "Abdullah Muzahid and Dario Su{\'a}rez and Shanxiang Qi and Josep Torrellas",
  title =        "{SigRace}: signature-based data race detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "337--348",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555797",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Detecting data races in parallel programs is important for both software development and production-run diagnosis. Recently, there have been several proposals for hardware-assisted data race detection. Such proposals typically modify the L1 cache and cache coherence protocol messages, and largely lose their capability when lines get displaced or invalidated from the cache. To eliminate these shortcomings, this paper proposes a novel, different approach to hardware-assisted data race detection. The approach, called SigRace, relies on hardware address signatures.
As a processor runs, the addresses of the data that it accesses are automatically encoded in signatures. At certain times, the signatures are automatically passed to a hardware module that intersects them with those of other processors. If the intersection is not null, a data race may have occurred.\par This paper presents the architecture of SigRace, an implementation, and its software interface. With SigRace, caches and coherence protocol messages are unmodified. Moreover, cache lines can be displaced and invalidated with no effect. Our experiments show that SigRace is significantly more effective than a state-of-the-art conventional hardware-assisted race detector. SigRace finds on average 29\% more static races and 107\% more dynamic races. Moreover, if we inject data races, SigRace finds 150\% more static races than the conventional scheme.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "concurrency defect; data race; happened-before; signature; SigRace; timestamp",
}

@Article{Nagarajan:2009:EEC,
  author =       "Vijay Nagarajan and Rajiv Gupta",
  title =        "{ECMon}: exposing cache events for monitoring",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "349--360",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555798",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The advent of multicores has introduced new challenges for programmers to provide increased performance and software reliability. There has been significant interest in techniques that use software speculation to better utilize the computational power of multicores.
At the same time, several recent proposals for ensuring software reliability are not applicable in a multicore setting due to their inability to handle interprocessor shared memory dependences (ISMDs). The demands for performing speculation and ensuring software reliability in a multicore setting, although seemingly different, share a common requirement: the need for monitoring program execution and collecting interprocessor dependence information at low overhead. For example, an important component of speculation is the efficient detection of misspeculation which in turn requires dependence information. Likewise, tasks that help ensure software reliability on multicores, including {\em recording for replay}, require ISMD information.\par In this paper, we propose {\em ECMon:\/} support for exposing cache events to the software. This enables the programmer to catch these events and react to them; in effect, efficiently exposing the ISMDs to the programmer. In the context of speculation, we show how {\em ECMon\/} optimizes the detection of misspeculation; we use this simple support to speculate past active barriers and achieve a speedup of 12\% for the set of parallel programs considered. As an application of ensuring software reliability, we show how {\em ECMon\/} can be used to record shared memory dependences on multicores using no specialized hardware support at only 2.8 fold execution time overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache events; recording for replay; speculation past barriers",
}

@Article{Saidi:2009:EEP,
  author =       "Ali G. Saidi and Nathan L. Binkert and Steven K.
Reinhardt and Trevor Mudge",
  title =        "End-to-end performance forecasting: finding bottlenecks before they happen",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "361--370",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555800",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many important workloads today, such as web-hosted services, are limited not by processor core performance but by interactions among the cores, the memory system, I/O devices, and the complex software layers that tie these components together. Architects designing future systems for these workloads are challenged to identify performance bottlenecks because, as in any concurrent system, overheads in one component may be hidden due to overlap with other operations. These overlaps span the user/kernel and software/hardware boundaries, making traditional performance analysis techniques inadequate.\par We present a methodology for identifying end-to-end critical paths across software and simulated hardware in complex networked systems. By modeling systems as collections of state machines interacting via queues, we can trace critical paths through multiplexed processing engines, identify when resources create bottlenecks (including abstract resources such as flow-control credits), and predict the benefit of eliminating bottlenecks by increasing hardware speeds or expanding available resources.\par We implement our technique in a full-system simulator and analyze a TCP microbenchmark, a web server, the Linux TCP/IP stack, and an Ethernet controller. From a single run of the microbenchmark, our tool---within minutes---correctly identifies a series of bottlenecks, and predicts the performance of hypothetical systems in which these bottlenecks are successively eliminated, culminating in a total speedup of 3X.
We then validate these predictions through hours of additional simulation, and find them to be accurate within 1--17\%. We also analyze the web server, find it to be CPU-bound, and predict the performance of a system with an additional core within 6\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "critical path analysis; performance analysis",
}

@Article{Rogers:2009:SBW,
  author =       "Brian M. Rogers and Anil Krishna and Gordon B. Bell and Ken Vu and Xiaowei Jiang and Yan Solihin",
  title =        "Scaling the bandwidth wall: challenges in and avenues for {CMP} scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "371--382",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555801",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As transistor density continues to grow at an exponential rate in accordance to Moore's law, the goal for many Chip Multi-Processor (CMP) systems is to scale the number of on-chip cores proportionally. Unfortunately, off-chip memory bandwidth capacity is projected to grow slowly compared to the desired growth in the number of cores. This creates a situation in which each core will have a decreasing amount of off-chip bandwidth that it can use to load its data from off-chip memory. The situation in which off-chip bandwidth is becoming a performance and throughput bottleneck is referred to as the {\em bandwidth wall\/} problem.\par In this study, we seek to answer two questions: (1) to what extent does the bandwidth wall problem restrict future multicore scaling, and (2) to what extent are various bandwidth conservation techniques able to mitigate this problem.
To address them, we develop a simple but powerful analytical model to predict the number of on-chip cores that a CMP can support given a limited growth in memory traffic capacity. We find that the bandwidth wall can severely limit core scaling. When starting with a balanced 8-core CMP, in four technology generations the number of cores can only scale to 24, as opposed to 128 cores under proportional scaling, without increasing the memory traffic requirement. We find that various individual bandwidth conservation techniques we evaluate have a wide ranging impact on core scaling, and when combined together, these techniques have the potential to enable super-proportional core scaling for up to 4 technology generations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "analytical model; chip multi-processor; memory bandwidth",
}

@Article{Whitney:2009:FTA,
  author =       "Mark G. Whitney and Nemanja Isailovic and Yatish Patel and John Kubiatowicz",
  title =        "A fault tolerant, area efficient architecture for {Shor}'s factoring algorithm",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "383--394",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555802",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We optimize the area and latency of Shor's factoring while simultaneously improving fault tolerance through: (1) balancing the use of ancilla generators, (2) aggressive optimization of error correction, and (3) tuning the core adder circuits. Our custom CAD flow produces detailed layouts of the physical components and utilizes simulation to analyze circuits in terms of area, latency, and success probability.
We introduce a metric, called ADCR, which is the probabilistic equivalent of the classic Area-Delay product. Our error correction optimization can reduce ADCR by order of magnitude or more. Contrary to conventional wisdom, we show that the area of an optimized quantum circuit is {\em not\/} dominated exclusively by error correction. Further, our adder evaluation shows that quantum carry-lookahead adders (QCLA) beat ripple-carry adders in ADCR, despite being larger and more complex. We conclude with what we believe is one of most accurate estimates of the area and latency required for 1024-bit Shor's factorization: 7659 mm$^2$ for the smallest circuit and $6 \times 10^8$ seconds for the fastest circuit.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "CAD; control; ion trap; layout; quantum computing",
}

@Article{Putnam:2009:PPC,
  author =       "Andrew Putnam and Susan Eggers and Dave Bennett and Eric Dellinger and Jeff Mason and Henry Styles and Prasanna Sundararajan and Ralph Wittig",
  title =        "Performance and power of cache-based reconfigurable computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "395--405",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555804",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many-cache is a memory architecture that efficiently supports caching in commercially available FPGAs.
It facilitates FPGA programming for high-performance computing (HPC) developers by providing them with memory performance that is greater and power consumption that is less than their current CPU platforms, but without sacrificing their familiar, C-based programming environment.\par Many-cache creates multiple, multi-banked caches on top of an FGPA's small, independent memories, each targeting a particular data structure or region of memory in an application and each customized for the memory operations that access it. The caches are automatically generated from C source by the CHiMPS C-to-FPGA compiler.\par This paper presents the analyses and optimizations of the CHiMPS compiler that construct many-cache caches. An architectural evaluation of CHiMPS-generated FPGAs demonstrates a performance advantage of 7.8x (geometric mean) over CPU-only execution of the same source code, FPGA power usage that is on average 4.1x less, and consequently performance per watt that is also greater, by a geometric mean of 21.3x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "C-to-gates; C-to-hardware; caches; co-processor accelerator; FPGA; many-cache; synthesis compiler", } @Article{Firoozshahian:2009:MSD, author = "Amin Firoozshahian and Alex Solomatnikov and Ofer Shacham and Zain Asgar and Stephen Richardson and Christos Kozyrakis and Mark Horowitz", title = "A memory system design framework: creating smart memories", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "406--417", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555805", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As CPU cores become building blocks, we see a great expansion in the types of on-chip memory systems proposed for CMPs. 
Unfortunately, designing the cache and protocol controllers to support these memory systems is complex, and their concurrency and latency characteristics significantly affect the performance of any CMP. To address this problem, this paper presents a microarchitecture framework for cache and protocol controllers, which can aid in generating the RTL for new memory systems. The framework consists of three pipelined engines---request-tracking, state-manipulation, and data movement---which are programmed to implement a higher-level memory model. This approach simplifies the design and verification of CMP systems by decomposing the memory model into sequences of state and data manipulations. Moreover, implementing the framework itself produces a polymorphic memory system.\par To validate the approach, we implemented a scalable, flexible CMP in silicon. The memory system was then programmed to support three disparate memory models---cache coherent shared memory, streams and transactional memory. Measured overheads of this approach seem promising. Our system generates controllers with performance overheads of less than 20\% compared to an ideal controller with zero internal latency. Even the overhead of directly implementing a fully programmable controller was modest. While it did double the controller's area, the amortized effective area in the system grew by roughly 7\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache coherence; memory access protocol; memory systems; multi-core processors; protocol controller; reconfigurable architecture; stream programming; transactional memory", } @Article{Joao:2009:FRC, author = "Jos{\'e} A. Joao and Onur Mutlu and Yale N.
Patt", title = "Flexible reference-counting-based hardware acceleration for garbage collection", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "418--428", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555806", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Languages featuring automatic memory management (garbage collection) are increasingly used to write all kinds of applications because they provide clear software engineering and security advantages. Unfortunately, garbage collection imposes a toll on performance and introduces pause times, making such languages less attractive for high-performance or real-time applications. Much progress has been made over the last five decades to reduce the overhead of garbage collection, but it remains significant.\par We propose a cooperative hardware-software technique to reduce the performance overhead of garbage collection. The key idea is to reduce the frequency of garbage collection by efficiently detecting and reusing dead memory space in hardware via hardware-implemented reference counting. Thus, even though software garbage collections are still eventually needed, they become much less frequent and have less impact on overall performance. 
Our technique is compatible with a variety of software garbage collection algorithms, does not break compatibility with existing software, and reduces garbage collection time by 31\% on average on the Java DaCapo benchmarks running on the production build of the Jikes RVM, which uses a state-of-the-art generational garbage collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "garbage collection; reference counting", } @Article{Pan:2009:FIF, author = "Yan Pan and Prabhat Kumar and John Kim and Gokhan Memik and Yu Zhang and Alok Choudhary", title = "{Firefly}: illuminating future network-on-chip with nanophotonics", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "429--440", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555808", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Future many-core processors will require high-performance yet energy-efficient on-chip networks to provide a communication substrate for the increasing number of cores. Recent advances in silicon nanophotonics create new opportunities for on-chip networks. To efficiently exploit the benefits of nanophotonics, we propose Firefly - a hybrid, hierarchical network architecture. Firefly consists of clusters of nodes that are connected using conventional, electrical signaling while the inter-cluster communication is done using nanophotonics - exploiting the benefits of electrical signaling for short, local communication while nanophotonics is used only for global communication to realize an efficient on-chip network. Crossbar architecture is used for inter-cluster communication. However, to avoid global arbitration, the crossbar is partitioned into multiple, logical crossbars and their arbitration is localized. 
Our evaluations show that Firefly improves the performance by up to 57\% compared to an all-electrical concentrated mesh (CMESH) topology on adversarial traffic patterns and up to 54\% compared to an all-optical crossbar (OP XBAR) on traffic patterns with locality. If the energy-delay-product is compared, Firefly improves the efficiency of the on-chip network by up to 51\% and 38\% compared to CMESH and OP XBAR, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "hierarchical network; interconnection networks; nanophotonics; topology", } @Article{Cianchetti:2009:PRT, author = "Mark J. Cianchetti and Joseph C. Kerekes and David H. Albonesi", title = "{Phastlane}: a rapid transit optical routing network", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "441--450", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555809", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Tens and eventually hundreds of processing cores are projected to be integrated onto future microprocessors, making the global interconnect a key component to achieving scalable chip performance within a given power envelope. While CMOS-compatible nanophotonics has emerged as a leading candidate for replacing global wires beyond the 22nm timeframe, on-chip optical interconnect architectures proposed thus far are either limited in scalability or are dependent on comparatively slow electrical control networks.\par In this paper, we present Phastlane, a hybrid electrical/optical routing network for future large scale, cache coherent multicore microprocessors. 
The heart of the Phastlane network is a low-latency optical crossbar that uses simple predecoded source routing to transmit cache-line-sized packets several hops in a single clock cycle under contentionless conditions. When contention exists, the router makes use of electrical buffers and, if necessary, a high speed drop signaling network. Overall, Phastlane achieves 2X better network performance than a state-of-the-art electrical baseline while consuming 80\% less network power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "interconnection networks; multicore; nanophotonics; optical interconnects", } @Article{Abts:2009:APP, author = "Dennis Abts and Natalie D. Enright Jerger and John Kim and Dan Gibson and Mikko H. Lipasti", title = "Achieving predictable performance through better memory controller placement in many-core {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "451--461", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555810", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the near term, Moore's law will continue to provide an increasing number of transistors and therefore an increasing number of on-chip cores. Limited pin bandwidth prevents the integration of a large number of memory controllers on-chip. With many cores, and few memory controllers, where to locate the memory controllers in the on-chip interconnection fabric becomes an important and as yet unexplored question. In this paper we show how the location of the memory controllers can reduce contention (hot spots) in the on-chip fabric and lower the variance in reference latency.
This in turn provides predictable performance for memory-intensive applications regardless of the processing core on which a thread is scheduled. We explore the design space of on-chip fabrics to find optimal memory controller placement relative to different topologies (i.e. mesh and torus), routing algorithms, and workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; interconnection networks; memory controllers; routing algorithms", } @Article{Luo:2009:DPT, author = "Yangchun Luo and Venkatesan Packirisamy and Wei-Chung Hsu and Antonia Zhai and Nikhil Mungre and Ankit Tarkas", title = "Dynamic performance tuning for speculative threads", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "462--473", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555812", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In response to the emergence of multicore processors, various novel and sophisticated execution models have been introduced to fully utilize these processors. One such execution model is Thread-Level Speculation (TLS), which allows potentially dependent threads to execute speculatively in parallel. While TLS offers significant performance potential for applications that are otherwise non-parallel, extracting efficient speculative threads in the presence of complex control flow and ambiguous data dependences is a real challenge. This task is further complicated by the fact that the performance of speculative threads is often architecture-dependent, input-sensitive, and exhibits phase behaviors. 
Thus we propose dynamic performance tuning mechanisms that determine where and how to create speculative threads at runtime.\par This paper describes the design, implementation, and evaluation of hardware and software support that takes advantage of runtime performance profiles to extract efficient speculative threads. In our proposed framework, speculative threads are monitored by hardware-based performance counters and their performance impact is estimated. The creation of speculative threads is adjusted based on the estimation. This paper proposes speculative threads performance estimation techniques, that are capable of correctly determining whether speculation can improve performance for loops that correspond to 83.8\% of total loop execution time across all benchmarks. This paper also examines several dynamic performance tuning policies and finds that the best tuning policy achieves an overall speedup of 36.8\% on a set of benchmarks from SPEC2000 suite, which outperforms static thread management by 9.5\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dynamic optimization; multicore; parallelism; thread-level speculation", } @Article{Madriles:2009:BST, author = "Carlos Madriles and Pedro L{\'o}pez and Josep M. Codina and Enric Gibert and Fernando Latorre and Alejandro Martinez and Ra{\'u}l Martinez and Antonio Gonzalez", title = "Boosting single-thread performance in multi-core systems through fine-grain multi-threading", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "474--483", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555813", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Industry has shifted towards multi-core designs as we have hit the memory and power walls.
However, single thread performance remains of paramount importance since some applications have limited thread-level parallelism (TLP), and even a small part with limited TLP impose important constraints to the global performance, as explained by Amdahl's law.\par In this paper we propose a novel approach for leveraging multiple cores to improve single-thread performance in a multi-core design. The proposed technique features a set of novel hardware mechanisms that support the execution of threads generated at compile time. These threads result from a fine-grain speculative decomposition of the original application and they are executed under a modified multi-core system that includes: (1) mechanisms to support multiple versions; (2) mechanisms to detect violations among threads; (3) mechanisms to reconstruct the original sequential order; and (4) mechanisms to checkpoint the architectural state and recovery to handle misspeculations.\par The proposed scheme outperforms previous hardware-only schemes to implement the idea of combining cores for executing single-thread applications in a multi-core design by more than 10\% on average on Spec2006 for all configurations. 
Moreover, single-thread performance is improved by 41\% on average when the proposed scheme is used on a Tiny Core, and up to 2.6x for some selected applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "automatic parallelization; core-fusion; multicore; single-thread performance; speculative multithreading; thread-level parallelism", } @Article{Chaudhry:2009:SST, author = "Shailender Chaudhry and Robert Cypher and Magnus Ekman and Martin Karlsson and Anders Landin and Sherman Yip and H{\aa}kan Zeffer and Marc Tremblay", title = "Simultaneous speculative threading: a novel pipeline architecture implemented in {Sun}'s {Rock} processor", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "484--495", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555814", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents Simultaneous Speculative Threading (SST), which is a technique for creating high-performance area- and power-efficient cores for chip multiprocessors. SST hardware dynamically extracts two threads of execution from a single sequential program (one consisting of a load miss and its dependents, and the other consisting of the instructions that are independent of the load miss) and executes them in parallel. SST uses an efficient checkpointing mechanism to eliminate the need for complex and power-inefficient structures such as register renaming logic, reorder buffers, memory disambiguation buffers, and large issue windows. Simulations of certain SST implementations show 18\% better per-thread performance on commercial benchmarks than larger and higher-powered out-of-order cores. 
Sun Microsystems' ROCK processor, which is the first processor to use SST cores, has been implemented and is scheduled to be commercially available in 2009.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "checkpoint-based architecture; chip multiprocessor; CMP; hardware speculation; instruction-level parallelism; memory-level parallelism; processor architecture; SST", } @Article{Thomasian:2009:PSS, author = "Alexander Thomasian", title = "Publications on storage and systems research", journal = j-COMP-ARCH-NEWS, volume = "37", number = "4", pages = "1--26", month = sep, year = "2009", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Mar 15 19:03:39 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Musoll:2009:MBM, author = "Enric Musoll", title = "Mesh-based many-core performance under process variations: a core yield perspective", journal = j-COMP-ARCH-NEWS, volume = "37", number = "4", pages = "27--34", month = sep, year = "2009", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Mar 15 19:03:39 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nikolov:2009:QTM, author = "Angel V. 
Nikolov", title = "Queuing theoretic model for a multiprocessor with private caches and shared memory", journal = j-COMP-ARCH-NEWS, volume = "37", number = "4", pages = "35--44", month = sep, year = "2009", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Mar 15 19:03:39 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2009:INb, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "37", number = "4", pages = "45--51", month = sep, year = "2009", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Mon Mar 15 19:03:39 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Musoll:2009:LSO, author = "Enric Musoll", title = "Leakage-saving opportunities in mesh-based massive multi-core architectures", journal = j-COMP-ARCH-NEWS, volume = "37", number = "5", pages = "1--7", month = dec, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1755235.1755237", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Apr 8 18:42:25 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "When processing multi-threaded workloads requiring significant inter-thread communication, opportunities to reduce power consumption arise due to the large latencies in obtaining data from the threads running on remote cores and the lack of architectural resources implemented in the simple cores to cover these latencies.\par In this work we propose to use the drowsy mode technique to save leakage power on the cores and leverage the mesh-based communication fabric to 
hide the wake-up latency of the core blocks. We have observed a potential for reducing the overall power of around 70\% in a generic homogeneous 256-core tile-based multi-core architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Naeem:2009:SRC, author = "Abdul Naeem and Xiaowen Chen and Zhonghai Lu and Axel Jantsch", title = "Scalability of relaxed consistency models in {NoC} based multicore architectures", journal = j-COMP-ARCH-NEWS, volume = "37", number = "5", pages = "8--15", month = dec, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1755235.1755238", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Apr 8 18:42:25 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper studies realization of relaxed memory consistency models in the network-on-chip based distributed shared memory (DSM) multi-core systems. Within DSM systems, memory consistency is a critical issue since it affects not only the performance but also the correctness of programs. We investigate the scalability of the relaxed consistency models (weak, release consistency) implemented by using transaction counters. Our experimental results compare the average and maximum code, synchronization and data latencies of the two consistency models for various network sizes with regular mesh topologies. The observed latencies rise for both the consistency models as the network size grows. However, the scaling behaviors are different. With the release consistency model these latencies grow significantly slower than with the weak consistency due to better optimization potential by means of overlapping, reordering and program order relaxations. 
The release consistency improves the performance by 15.6\% and 26.5\% on average in the code and consistency latencies over the weak consistency model for the specific application, as the system grows from single core to 64 cores. The latency of data transactions grows 2.2 times faster on the average with a weak consistency model than with a release consistency model when the system scales from single core to 64 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "distributed shared memory; memory consistency; scalability; synchronization", } @Article{Sharma:2009:RPL, author = "Sandeep Sharma and K. S. Kahlon and P. K. Bansal", title = "Reliability and path length analysis of irregular fault tolerant multistage interconnection network", journal = j-COMP-ARCH-NEWS, volume = "37", number = "5", pages = "16--23", month = dec, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1755235.1755239", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Apr 8 18:42:25 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper reliability and path length analysis of irregular Multistage Interconnection Networks have been presented. We have examined FT(Four Tree)[8], MFT(Modified Four Tree)[2], NFT(New Four Tree)[4], IFT(improved Four Tree)[5], IASN(Irregular Augmented Shuffle)[14] and IIASN(Improved Irregular Augmented Shuffle)[3] networks in which the number of switches in each stage are different in numbers and also have express links[11]. Using upper and lower bounds[7][13][15] for larger networks, the reliability[9] in terms of mean time to failure of all these networks are evaluated and compared with each other. Each source is connected to destination with one or multiple paths with varying path lengths in a network. The path length analysis of all these networks is also analyzed in this paper.
A path length[8] algorithm for IIASN network is also proposed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "four tree network; IIASN; multistage interconnection network; network reliability; NFT; path length; upper bound reliability", } @Article{Thorson:2009:INc, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "37", number = "5", pages = "24--30", month = dec, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1755235.1755241", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Apr 8 18:42:25 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brewer:2010:TDR, author = "Eric A. Brewer", title = "Technology for developing regions: {Moore's Law} is not enough", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "1--2", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ipek:2010:DRM, author = "Engin Ipek and Jeremy Condit and Edmund B.
Nightingale and Doug Burger and Thomas Moscibroda", title = "Dynamically replicated memory: building reliable systems from nanoscale resistive memories", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "3--14", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kirman:2010:PEA, author = "Nevin Kirman and Jos{\'e} F. Mart{\'\i}nez", title = "A power-efficient all-optical on-chip interconnect using wavelength-based oblivious routing", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "15--28", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Neelakantam:2010:RSE, author = "Naveen Neelakantam and David R. 
Ditzel and Craig Zilles", title = "A real system evaluation of hardware atomicity for software speculation", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "29--38", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Harris:2010:DFM, author = "Tim Harris and Sasa Tomic and Adri{\'a}n Cristal and Osman Unsal", title = "Dynamic filtering: multi-purpose architecture support for language runtime systems", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "39--52", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bergan:2010:CCR, author = "Tom Bergan and Owen Anderson and Joseph Devietti and Luis Ceze and Dan Grossman", title = "{CoreDet}: a compiler and runtime system for deterministic multithreaded execution", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "53--64", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Raman:2010:SPU, author = "Arun Raman and Hanjun Kim and Thomas R. Mason and Thomas B. Jablin and David I. 
August", title = "Speculative parallelization using software multi-threaded transactions", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "65--76", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:2010:REO, author = "Dongyoon Lee and Benjamin Wester and Kaushik Veeraraghavan and Satish Narayanasamy and Peter M. Chen and Jason Flinn", title = "{Respec}: efficient online multiprocessor replay via speculation and external determinism", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "77--90", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Eyerman:2010:PJS, author = "Stijn Eyerman and Lieven Eeckhout", title = "Probabilistic job symbiosis modeling for {SMT} processor scheduling", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "91--102", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shen:2010:RBV, author = "Kai Shen", title = "Request behavior variations", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "103--116", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 
(ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Johnson:2010:DCM, author = "F. Ryan Johnson and Radu Stoica and Anastasia Ailamaki and Todd C. Mowry", title = "Decoupling contention management from scheduling", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "117--128", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhuravlev:2010:ASR, author = "Sergey Zhuravlev and Sergey Blagodurov and Alexandra Fedorova", title = "Addressing shared resource contention in multicore processors via scheduling", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "129--142", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yuan:2010:SED, author = "Ding Yuan and Haohui Mai and Weiwei Xiong and Lin Tan and Yuanyuan Zhou and Shankar Pasupathy", title = "{SherLog}: error diagnosis by connecting clues from run-time logs", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "143--154", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Weeratunge:2010:AMD, author = "Dasarath Weeratunge and Xiangyu Zhang and Suresh Jagannathan", title = "Analyzing multicore dumps to facilitate concurrency bug reproduction", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "155--166", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Burckhardt:2010:RSP, author = "Sebastian Burckhardt and Pravesh Kothari and Madanlal Musuvathi and Santosh Nagarakatte", title = "A randomized scheduler with probabilistic guarantees of finding bugs", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "167--178", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:2010:CDS, author = "Wei Zhang and Chong Sun and Shan Lu", title = "{ConMem}: detecting severe concurrency bugs through an effect-oriented approach", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "179--192", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = 
"https://dl.acm.org/loi/sigarch", } @Article{Mesa-Martinez:2010:CPT, author = "Francisco Javier Mesa-Martinez and Ehsan K. Ardestani and Jose Renau", title = "Characterizing processor thermal behavior", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "193--204", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Venkatesh:2010:CCR, author = "Ganesh Venkatesh and Jack Sampson and Nathan Goulding and Saturnino Garcia and Vladyslav Bryksin and Jose Lugo-Martinez and Steven Swanson and Michael Bedford Taylor", title = "Conservation cores: reducing the energy of mature computations", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "205--218", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sudan:2010:MPI, author = "Kshitij Sudan and Niladrish Chatterjee and David Nellans and Manu Awasthi and Rajeev Balasubramonian and Al Davis", title = "Micro-pages: increasing {DRAM} efficiency with locality-aware data placement", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "219--230", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = 
"https://dl.acm.org/loi/sigarch", } @Article{Pelley:2010:PRD, author = "Steven Pelley and David Meisner and Pooya Zandevakili and Thomas F. Wenisch and Jack Underwood", title = "Power routing: dynamic power provisioning in the data center", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "231--242", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ahmad:2010:JOI, author = "Faraz Ahmad and T. N. Vijaykumar", title = "Joint optimization of idle and cooling power in data centers while maintaining response time", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "243--256", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Goodstein:2010:BAA, author = "Michelle L. Goodstein and Evangelos Vlachos and Shimin Chen and Phillip B. Gibbons and Michael A. Kozuch and Todd C. 
Mowry", title = "Butterfly analysis: adapting dataflow analysis to dynamic parallel monitoring", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "257--270", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vlachos:2010:PEA, author = "Evangelos Vlachos and Michelle L. Goodstein and Michael A. Kozuch and Shimin Chen and Babak Falsafi and Phillip B. Gibbons and Todd C. Mowry", title = "{ParaLog}: enabling and accelerating online parallel monitoring of multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "271--284", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hormati:2010:MMS, author = "Amir H. Hormati and Yoonseo Choi and Mark Woh and Manjunath Kudlur and Rodric Rabbah and Trevor Mudge and Scott Mahlke", title = "{MacroSS}: macro-{SIMDization} of streaming applications", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "285--296", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Woo:2010:CPD, author = "Dong Hyuk Woo and Hsien-Hsin S. 
Lee", title = "{COMPASS}: a programmable data prefetcher using idle {GPU} shaders", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "297--310", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sanchez:2010:FAS, author = "Daniel Sanchez and Richard M. Yoo and Christos Kozyrakis", title = "Flexible architectural support for fine-grain scheduling", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "311--322", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Romanescu:2010:SDV, author = "Bogdan F. Romanescu and Alvin R. Lebeck and Daniel J. Sorin", title = "Specifying and dynamically verifying address translation-aware memory consistency", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "323--334", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ebrahimi:2010:FST, author = "Eiman Ebrahimi and Chang Joo Lee and Onur Mutlu and Yale N. 
Patt", title = "Fairness via source throttling: a configurable and high-performance fairness substrate for multi-core memory systems", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "335--346", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gelado:2010:ADS, author = "Isaac Gelado and Javier Cabezas and Nacho Navarro and John E. Stone and Sanjay Patel and Wen-mei W. Hwu", title = "An asymmetric distributed shared memory model for heterogeneous parallel systems", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "347--358", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bhattacharjee:2010:ICC, author = "Abhishek Bhattacharjee and Margaret Martonosi", title = "Inter-core cooperative {TLB} for chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "359--370", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huang:2010:OES, author = "Ruirui Huang and Daniel Y. Deng and G. 
Edward Suh", title = "Orthrus: efficient software integrity protection on multi-cores", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "371--384", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Feng:2010:SPS, author = "Shuguang Feng and Shantanu Gupta and Amin Ansari and Scott Mahlke", title = "Shoestring: probabilistic soft error reliability on the cheap", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "385--396", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yoon:2010:VFE, author = "Doe Hyun Yoon and Mattan Erez", title = "Virtualized and flexible {ECC} for main memory", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "397--408", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thomasian:2010:SRI, author = "Alexander Thomasian", title = "Storage research in industry and universities", journal = j-COMP-ARCH-NEWS, volume = "38", number = "2", pages = "1--48", month = may, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1823838.1823840", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", 
ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:38 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We review activities at universities and industrial research centers in the storage area, but also briefly mention topics such as processor design, operating systems, databases, and performance analysis. Our starting point is the Berkeley RAID proposal and the associated taxonomy two decades ago. Important research groups are listed and key researchers are identified. We pay special attention to faculty/student relationships, listing PhD theses and articles related to storage. We also describe innovative storage products and the companies behind them. This paper complements author's 'Publications in Storage and Systems', ACM CAN, Sept. 2009.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Matthes:2010:RIC, author = "Wolfgang Matthes", title = "Resources instead of cores?", journal = j-COMP-ARCH-NEWS, volume = "38", number = "2", pages = "49--63", month = may, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1823838.1823841", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:38 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Mapping conventional applications to multiple cores is a difficult problem. To provide a general solution, it is proposed to abandon the very concept of processor cores and to populate the silicon real estate with less complex control and operation units, designated as resources. A hardware-software API is described that can put into effect a practically unlimited number of such resources and that allows for completely describing and exploiting the inherent parallelism of the application problems. 
The paper introduces the principles of operation, discusses problems of feasibility and outlines the basic philosophy behind the approach. The proposed principles may lead to:\par * Instruction set architectures which can cope with a transfinite number of hardware resources.\par * Processor circuits containing resources of intermediate granularity and appropriately optimized interconnects.\par * Considerable reduction of power consumption during operation at full speed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "computer architecture; inherent parallelism; multicore processors; parallel computing; power saving", } @Article{Thorson:2010:INa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "38", number = "2", pages = "64--67", month = may, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1823838.1823843", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:38 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dally:2010:MNC, author = "William J. Dally", title = "Moving the needle, computer architecture research in academe and industry", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "1--1", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815963", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The goal of computer architecture research is to move the needle, that is to affect the future of computing in a positive way. Publications, prototypes, and studies are all just different means to this common end. 
This talk will address how to move the needle in academic and industrial settings discussing what works and what doesn't. Our work is constrained by applications, technology, and commercial reality. The architecture funnel starts with many concepts that proceed through stages of evaluation and refinement. A relatively few successful concepts make it out the far side to deployment. Most concepts fail, and good researchers cut their losses early. The funnel has many years of latency and good researchers aim for results that are relevant beyond this latency. Academics are best at the early stages of the concept funnel -- where their long-term perspective and freedom from constraints are advantages. Industry excels at the later stages of the pipeline where resources and experience are well suited to refining ideas for deployment. Too often good concepts fall into a chasm between the two. Good partnerships are needed to bridge this chasm. This talk will illustrate this exploration of architecture research with numerous examples of successes and failures. It will give recommended best practices for academic and industrial research. I will close with a glimpse of the future of architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "research", } @Article{Watanabe:2010:WWD, author = "Yasuko Watanabe and John D. Davis and David A. Wood", title = "{WiDGET: Wisconsin Decoupled Grid Execution Tiles}", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "2--13", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815965", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The recent paradigm shift to multi-core systems results in high system throughput within a specified power budget. 
However, future systems still require good single thread performance--no longer the predominant design priority--to mitigate sequential bottlenecks and/or to guarantee service-level agreements. Unfortunately, near saturation in voltage scaling necessitates a long-term alternative to dynamic voltage and frequency scaling.\par We propose an energy-proportional computing infrastructure, called WiDGET, that decouples thread context management from a sea of simple execution units (EUs). WiDGET's decoupled design provides flexibility to alter resource allocation for a particular power-performance target while turning off unallocated resources. In other words, WiDGET enables dynamic customization of different combinations of small and/or powerful cores on a single chip, consuming power in proportion to the delivered performance.\par Over all SPEC CPU2006 benchmarks, WiDGET provides average per-thread performance that is 26\% better than a Xeon-like processor while using 8\% less power. WiDGET can also scale down to a level comparable to an Atom-like processor, turning off resources to reduce average power by 58\%. WiDGET achieves high power efficiency (BIPS$^3$/W), exceeding Xeon-like and Atom-like processors by up to 2x and 21x, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "hardware; instruction steering; performance; power efficiency; power proportional computing", } @Article{Gibson:2010:FSC, author = "Dan Gibson and David A. 
Wood", title = "{Forwardflow}: a scalable core for power-constrained {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "14--25", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815966", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Chip Multiprocessors (CMPs) are now commodity hardware, but commoditization of parallel software remains elusive. In the near term, the current trend of increased core-per-socket count will continue, despite a lack of parallel software to exercise the hardware. Future CMPs must deliver thread-level parallelism when software provides threads to run, but must also continue to deliver performance gains for single threads by exploiting instruction-level parallelism and memory-level parallelism. However, power limitations will prevent conventional cores from exploiting both simultaneously.\par This work presents the Forwardflow Architecture, which can scale its execution logic up to run single threads, or down to run multiple threads in a CMP. Forwardflow dynamically builds an explicit internal dataflow representation from a conventional instruction set architecture, using forward dependence pointers to guide instruction wakeup, selection, and issue. Forwardflow's backend is organized into discrete units that can be individually (de-)activated, allowing each core's performance to be scaled by system software at the architectural level.\par On single threads, Forwardflow core scaling yields a mean runtime reduction of 21\% for a 37\% increase in power consumption. 
For multithreaded workloads, a Forwardflow-based CMP allows system software to select the performance point that best matches available power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessor (cmp); power; scalable core", } @Article{Azizi:2010:EPT, author = "Omid Azizi and Aqeel Mahesri and Benjamin C. Lee and Sanjay J. Patel and Mark Horowitz", title = "Energy-performance tradeoffs in processor architecture and circuit design: a marginal cost analysis", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "26--36", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815967", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Power consumption has become a major constraint in the design of processors today. To optimize a processor for energy-efficiency requires an examination of energy-performance trade-offs in all aspects of the processor design space, including both architectural and circuit design choices. In this paper, we apply an integrated architecture-circuit optimization framework to map out energy-performance trade-offs of several different high-level processor architectures. We show how the joint architecture-circuit space provides a trade-off range of approximately 6.5x in performance for 4x energy, and we identify the optimal architectures for different design objectives. We then show that many of the designs in this space come at very high marginal costs. Our results show that, for a large range of design objectives, voltage scaling is effective in efficiently trading off performance and energy, and that the choice of optimal architecture and circuits does not change much during voltage scaling. 
Finally, we show that with only two designs--a dual-issue in-order design and a dual-issue out-of-order design, both properly optimized--a large part of the energy-performance trade-off space can be covered within 3\% of the optimal energy-efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "co-optimization; design space exploration; design trade-offs; energy efficiency; microarchitecture; optimization", } @Article{Hameed:2010:USI, author = "Rehan Hameed and Wajahat Qadeer and Megan Wachs and Omid Azizi and Alex Solomatnikov and Benjamin C. Lee and Stephen Richardson and Christos Kozyrakis and Mark Horowitz", title = "Understanding sources of inefficiency in general-purpose chips", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "37--47", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815968", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Due to their high volume, general-purpose processors, and now chip multiprocessors (CMPs), are much more cost effective than ASICs, but lag significantly in terms of performance and energy efficiency. This paper explores the sources of these performance and energy overheads in general-purpose processing systems by quantifying the overheads of a 720p HD H.264 encoder running on a general-purpose CMP system. It then explores methods to eliminate these overheads by transforming the CPU into a specialized system for H.264 encoding. We evaluate the gains from customizations useful to broad classes of algorithms, such as SIMD units, as well as those specific to particular computation, such as customized storage and functional units.\par The ASIC is 500x more energy efficient than our original four-processor CMP. 
Broadly applicable optimizations improve performance by 10x and energy by 7x. However, the very low energy costs of actual core ops (100s fJ in 90nm) mean that over 90\% of the energy used in these solutions is still 'overhead'. Achieving ASIC-like performance and efficiency requires algorithm-specific optimizations. For each sub-algorithm of H.264, we create a large, specialized functional unit that is capable of executing 100s of operations per instruction. This improves performance and energy by an additional 25x and the final customized CMP matches an ASIC solution's performance within 3x of its energy and within comparable area.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "ASIC; chip multiprocessor; customization; energy efficiency; h.264; high performance; Tensilica", } @Article{Barr:2010:TCS, author = "Thomas W. Barr and Alan L. Cox and Scott Rixner", title = "Translation caching: skip, don't walk (the page table)", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "48--59", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815970", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper explores the design space of MMU caches that accelerate virtual-to-physical address translation in processor architectures, such as x86-64, that use a radix tree page table. In particular, these caches accelerate the page table walk that occurs after a miss in the Translation Lookaside Buffer. This paper shows that the most effective MMU caches are translation caches, which store partial translations and allow the page walk hardware to skip one or more levels of the page table.\par In recent years, both AMD and Intel processors have implemented MMU caches. 
However, their implementations are quite different and represent distinct points in the design space. This paper introduces three new MMU cache structures that round out the design space and directly compares the effectiveness of all five organizations. This comparison shows that two of the newly introduced structures, both of which are translation cache variants, are better than existing structures in many situations.\par Finally, this paper contributes to the age-old discourse concerning the relative effectiveness of different page table organizations. Generally speaking, earlier studies concluded that organizations based on hashing, such as the inverted page table, outperformed organizations based upon radix trees for supporting large virtual address spaces. However, these studies did not take into account the possibility of caching page table entries from the higher levels of the radix tree. This paper shows that any of the five MMU cache structures will reduce radix tree page table DRAM accesses far below an inverted page table.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "memory management; page walk caching; TLB", } @Article{Jaleel:2010:HPC, author = "Aamer Jaleel and Kevin B. Theobald and Simon C. {Steely, Jr.} and Joel Emer", title = "High performance cache replacement using re-reference interval prediction {(RRIP)}", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "60--71", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815971", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Practical cache replacement policies attempt to emulate optimal replacement by predicting the re-reference interval of a cache block. 
The commonly used LRU replacement policy always predicts a near-immediate re-reference interval on cache hits and misses. Applications that exhibit a distant re-reference interval perform badly under LRU. Such applications usually have a working-set larger than the cache or have frequent bursts of references to non-temporal data (called scans). To improve the performance of such workloads, this paper proposes cache replacement using Re-reference Interval Prediction (RRIP). We propose Static RRIP (SRRIP) that is scan-resistant and Dynamic RRIP (DRRIP) that is both scan-resistant and thrash-resistant. Both RRIP policies require only 2-bits per cache block and easily integrate into existing LRU approximations found in modern processors. Our evaluations using PC games, multimedia, server and SPEC CPU2006 workloads on a single-core processor with a 2MB last-level cache (LLC) show that both SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 4\% and 10\% respectively. Our evaluations with over 1000 multi-programmed workloads on a 4-core CMP with an 8MB shared LLC show that SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 7\% and 9\% respectively. We also show that RRIP outperforms LFU, the state-of-the-art scan-resistant replacement algorithm to-date. For the cache configurations under study, RRIP requires 2X less hardware than LRU and 2.5X less hardware than LFU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "replacement; scan resistance; shared cache; thrashing", } @Article{Stuecheli:2010:VWQ, author = "Jeffrey Stuecheli and Dimitris Kaseridis and David Daly and Hillery C. Hunter and Lizy K. 
John", title = "The virtual write queue: coordinating {DRAM} and last-level cache policies", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "72--82", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815972", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In computer architecture, caches have primarily been viewed as a means to hide memory latency from the CPU. Cache policies have focused on anticipating the CPU's data needs, and are mostly oblivious to the main memory. In this paper, we demonstrate that the era of many-core architectures has created new main memory bottlenecks, and mandates a new approach: coordination of cache policy with main memory characteristics. Using the cache for memory optimization purposes, we propose a Virtual Write Queue which dramatically expands the memory controller's visibility of processor behavior, at low implementation overhead. Through memory-centric modification of existing policies, such as scheduled writebacks, this paper demonstrates that performance limiting effects of highly-threaded architectures can be overcome. We show that through awareness of the physical main memory layout and by focusing on writes, both read and write average latency can be shortened, memory power reduced, and overall system performance improved. 
Through full-system cycle-accurate simulations of SPEC cpu2006, we demonstrate that the proposed Virtual Write Queue achieves an average 10.9\% system-level throughput improvement on memory-intensive workloads, along with an overall reduction of 8.7\% in memory power across the whole suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cache-replacement; CMP many-core; DDR DDR2 DDR3; DRAM; DRAM-parameters; last-level-cache; memory-scheduling writeback; page-mode; write-queue; write-scheduling", } @Article{Wilkerson:2010:RCP, author = "Chris Wilkerson and Alaa R. Alameldeen and Zeshan Chishti and Wei Wu and Dinesh Somasekhar and Shih-lien Lu", title = "Reducing cache power with low-cost, multi-bit error-correcting codes", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "83--93", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815973", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Technology advancements have enabled the integration of large on-die embedded DRAM (eDRAM) caches. eDRAM is significantly denser than traditional SRAMs, but must be periodically refreshed to retain data. Like SRAM, eDRAM is susceptible to device variations, which play a role in determining refresh time for eDRAM cells. Refresh power potentially represents a large fraction of overall system power, particularly during low-power states when the CPU is idle. Future designs need to reduce cache power without incurring the high cost of flushing cache data when entering low-power states.\par In this paper, we show the significant impact of variations on refresh time and cache power consumption for large eDRAM caches. 
We propose Hi-ECC, a technique that incorporates multi-bit error-correcting codes to significantly reduce refresh rate. Multi-bit error-correcting codes usually have a complex decoder design and high storage cost. Hi-ECC avoids the decoder complexity by using strong ECC codes to identify and disable sections of the cache with multi-bit failures, while providing efficient single-bit error correction for the common case. Hi-ECC includes additional optimizations that allow us to amortize the storage cost of the code over large data words, providing the benefit of multi-bit correction at same storage cost as a single-bit error-correcting (SECDED) code (2\% overhead). Our proposal achieves a 93\% reduction in refresh power vs. a baseline eDRAM cache without error correcting capability, and a 66\% reduction in refresh power vs. a system using SECDED codes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "DRAM; ECC; eDRAM; idle power; idle states; multi-bit ECC; refresh power; Vccmin", } @Article{Xue:2010:ICF, author = "Jing Xue and Alok Garg and Berkehan Ciftcio{\u{g}}lu and Jianyun Hu and Shang Wang and Ioannis Savidis and Manish Jain and Rebecca Berman and Peng Liu and Michael Huang and Hui Wu and Eby Friedman and Gary Wicks and Duncan Moore", title = "An intra-chip free-space optical interconnect", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "94--105", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815975", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Continued device scaling enables microprocessors and other systems-on-chip (SoCs) to increase their performance, functionality, and hence, complexity. 
Simultaneously, relentless scaling, if uncompensated, degrades the performance and signal integrity of on-chip metal interconnects. These systems have therefore become increasingly communications-limited. The communications-centric nature of future high performance computing devices demands a fundamental change in intra- and inter-chip interconnect technologies.\par Optical interconnect is a promising long term solution. However, while significant progress in optical {\em signaling\/} has been made in recent years, {\em networking\/} issues for on-chip optical interconnect still require much investigation. Taking the underlying optical signaling systems as a drop-in replacement for conventional electrical signaling while maintaining conventional packet-switching architectures is unlikely to realize the full potential of optical interconnects. In this paper, we propose and study the design of a fully distributed interconnect architecture based on free-space optics. The architecture leverages a suite of newly-developed or emerging devices, circuits, and optics technologies. The interconnect avoids packet relay altogether, offers an ultra-low transmission latency and scalable bandwidth, and provides fresh opportunities for coherency substrate designs and optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "3d; free-space optical interconnect; intra-chip", } @Article{Das:2010:AEP, author = "Reetuparna Das and Onur Mutlu and Thomas Moscibroda and Chita R. 
Das", title = "{A{\'e}rgia}: exploiting packet latency slack in on-chip networks", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "106--116", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815976", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Traditional Network-on-Chips (NoCs) employ simple arbitration strategies, such as round-robin or oldest-first, to decide which packets should be prioritized in the network. This is counter-intuitive since different packets can have very different effects on system performance due to, e.g., different level of memory-level parallelism (MLP) of applications. Certain packets may be performance-critical because they cause the processor to stall, whereas others may be delayed for a number of cycles with no effect on application-level performance as their latencies are hidden by other outstanding packets' latencies. In this paper, we define slack as a key measure that characterizes the relative importance of a packet. Specifically, the slack of a packet is the number of cycles the packet can be delayed in the network with no effect on execution time. This paper proposes new router prioritization policies that exploit the available slack of interfering packets in order to accelerate performance-critical packets and thus improve overall system performance. When two packets interfere with each other in a router, the packet with the lower slack value is prioritized. We describe mechanisms to estimate slack, prevent starvation, and combine slack-based prioritization with other recently proposed application-aware prioritization mechanisms.\par We evaluate slack-based prioritization policies on a 64-core CMP with an 8x8 mesh NoC using a suite of 35 diverse applications. 
For a representative set of case studies, our proposed policy increases average system throughput by 21.0\% over the commonly-used round-robin policy. Averaged over 56 randomly-generated multiprogrammed workload mixes, the proposed policy improves system throughput by 10.3\%, while also reducing application-level unfairness by 30.8\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "arbitration; memory systems; multi-core; on-chip networks; packet scheduling; prioritization", } @Article{Koka:2010:SPN, author = "Pranay Koka and Michael O. McCracken and Herb Schwetman and Xuezhe Zheng and Ron Ho and Ashok V. Krishnamoorthy", title = "Silicon-photonic network architectures for scalable, power-efficient multi-chip systems", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "117--128", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815977", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Scaling trends of logic, memories, and interconnect networks lead towards dense many-core chips. Unfortunately, process yields and reticle sizes limit the scalability of large single-chip systems. Multi-chip systems break free of these areal limits, but in turn require enormous chip-to-chip bandwidth. The `macrochip' concept presented here integrates multiple many-core processor chips in a single package with silicon-photonic interconnects. This design enables a multi-chip system to approach the performance of a single large die.\par In this paper we propose three silicon-photonic network designs that provide low-power, high-bandwidth inter-die communication: a static wavelength-routed point-to-point network, a `two-phase' arbitrated network, and a limited-connectivity point-to-point network.
We also adapt two existing intra-chip silicon-photonic interconnects: a token-ring-based crossbar and a circuit-switched torus.\par We simulate a 64-die, 512-core cache-coherent macrochip using all of the above networks with synthetic kernels, and kernels from Splash-2 and PARSEC. We evaluate the networks on performance, optical power and complexity. Despite a narrow data-path width compared to the token-ring or torus, the point-to-point performs 3.3x and 3.9x better respectively. We show that the point-to-point is over 10x more power-efficient than the other networks. We also show that, contrary to electronic network designs, a point-to-point network has the lowest design complexity for an inter-chip silicon-photonic network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "interconnection networks; nanophotonics", } @Article{Beamer:2010:RAD, author = "Scott Beamer and Chen Sun and Yong-Jin Kwon and Ajay Joshi and Christopher Batten and Vladimir Stojanovi{\'c} and Krste Asanovi{\'c}", title = "Re-architecting {DRAM} memory systems with monolithically integrated silicon photonics", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "129--140", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815978", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The performance of future manycore processors will only scale with the number of integrated cores if there is a corresponding increase in memory bandwidth. Projected scaling of electrical DRAM architectures appears unlikely to suffice, being constrained by processor and DRAM pin-bandwidth density and by total DRAM chip power, including off-chip signaling, cross-chip interconnect, and bank access energy. 
In this work, we redesign the DRAM main memory system using a proposed monolithically integrated silicon photonics technology and show that our photonically interconnected DRAM (PIDRAM) provides a promising solution to all of these issues. Photonics can provide high aggregate pin-bandwidth density through dense wavelength-division multiplexing. Photonic signaling provides energy-efficient communication, which we exploit to not only reduce chip-to-chip interconnect power but to also reduce cross-chip interconnect power by extending the photonic links deep into the actual PIDRAM chips. To complement these large improvements in interconnect bandwidth and power, we decrease the number of bits activated per bank to improve the energy efficiency of the PIDRAM banks themselves. Our most promising design point yields approximately a 10x power reduction for a single-chip PIDRAM channel with similar throughput and area as a projected future electrical-only DRAM. Finally, we propose optical power guiding as a new technique that allows a single PIDRAM chip design to be used efficiently in several multi-chip configurations that provide either increased aggregate capacity or bandwidth.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dram architecture; energy-efficiency; silicon photonics", } @Article{Schechter:2010:UEE, author = "Stuart Schechter and Gabriel H. 
Loh and Karin Strauss and Doug Burger", title = "Use {ECP}, not {ECC}, for hard failures in resistive memories", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "141--152", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815980", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As leakage and other charge storage limitations begin to impair the scalability of DRAM, non-volatile resistive memories are being developed as a potential replacement. Unfortunately, current error correction techniques are poorly suited to this emerging class of memory technologies. Unlike DRAM, PCM and other resistive memories have wear lifetimes, measured in writes, that are sufficiently short to make cell failures common during a system's lifetime. However, resistive memories are much less susceptible to transient faults than DRAM. The Hamming-based ECC codes used in DRAM are designed to handle transient faults with no effective lifetime limits, but ECC codes applied to resistive memories would wear out faster than the cells they are designed to repair. This paper evaluates {\em Error-Correcting Pointers\/} (ECP), a new approach to error correction optimized for memories in which errors are the result of permanent cell failures that occur, and are immediately detectable, at write time. ECP corrects errors by permanently encoding the locations of failed cells into a table and assigning cells to replace them. ECP provides longer lifetimes than previously proposed solutions with equivalent overhead.
What's more, as the level of variance in cell lifetimes increases -- a likely consequence of further scaling -- ECP's margin of improvement over existing schemes increases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "error correction; hard failures; memory; phase change memory; resistive memories", } @Article{Qureshi:2010:MMS, author = "Moinuddin K. Qureshi and Michele M. Franceschini and Luis A. Lastras-Monta{\~n}o and John P. Karidis", title = "Morphable memory system: a robust architecture for exploiting multi-level phase change memories", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "153--162", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815981", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Phase Change Memory (PCM) is emerging as a scalable and power efficient technology to architect future main memory systems. The scalability of PCM is enhanced by the property that PCM devices can store multiple bits per cell. While such Multi-Level Cell (MLC) devices can offer high density, this benefit comes at the expense of increased read latency, which can cause significant performance degradation. This paper proposes {\em Morphable Memory System (MMS)}, a robust architecture for efficiently incorporating MLC PCM devices in main memory. MMS is based on observation that memory requirement varies between workloads, and systems are typically over-provisioned in terms of memory capacity. So, during a phase of low memory usage, some of the MLC devices can be operated at fewer bits per cell to obtain lower latency. When the workload requires full memory capacity, these devices can be restored to high density MLC operation to have full main-memory capacity. 
We provide the runtime monitors, the hardware-OS interface, and the detailed mechanism for implementing MMS. Our evaluations on an 8-core 8GB MLC PCM-based system show that MMS provides, on average, low latency access for 95\% of all memory requests, thereby improving overall system performance by 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "morphable memory; multi-level cell; phase change memory", } @Article{Pritchett:2010:SHS, author = "Timothy Pritchett and Mithuna Thottethodi", title = "{SieveStore}: a highly-selective, ensemble-level disk cache for cost-performance", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "163--174", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815982", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging solid-state storage media can significantly improve storage performance and energy. However, the high cost-per-byte of solid-state media has hindered wide-spread adoption in servers. This paper proposes a new, cost-effective architecture - SieveStore - which enables the use of solid-state media to significantly filter access to storage ensembles. Our paper makes three key contributions. First, we make a case for highly-selective, storage-ensemble-level disk-block caching based on the highly-skewed block popularity distribution and based on the dynamic nature of the popular block set. Second, we identify the problem of {\em allocation-writes\/} and show that selective cache allocation to reduce allocation-writes - {\em sieving\/} - is fundamental to enable efficient ensemble-level disk-caching. Third, we propose two practical variants of SieveStore. 
Based on week-long block access traces from a storage ensemble of 13 servers, we find that the two components (sieving and ensemble-level caching) each contribute to SieveStore's cost-effectiveness. Compared to unsieved, ensemble-level disk-caches, SieveStore achieves significantly higher hit ratios (35\%--50\% more, on average) while using only 1/7$^{th}$ the number of SSD drives. Further, ensemble-level caching is strictly better in cost-performance compared to per-server caching.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "disk cache; flash memory; selective allocation; solid state disks; storage; storage ensembles", } @Article{Udipi:2010:RDD, author = "Aniruddha N. Udipi and Naveen Muralimanohar and Niladrish Chatterjee and Rajeev Balasubramonian and Al Davis and Norman P. Jouppi", title = "Rethinking {DRAM} design and organization for energy-constrained multi-cores", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "175--186", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815983", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "DRAM vendors have traditionally optimized the cost-per-bit metric, often making design decisions that incur energy penalties. A prime example is the overfetch feature in DRAM, where a single request activates thousands of bit-lines in many DRAM chips, only to return a single cache line to the CPU. The focus on cost-per-bit is questionable in modern-day servers where operating costs can easily exceed the purchase cost.
Modern technology trends are also placing very different demands on the memory system: (i) queuing delays are a significant component of memory access time, (ii) there is a high energy premium for the level of reliability expected for business-critical computing, and (iii) the memory access stream emerging from multi-core systems exhibits limited locality. All of these trends necessitate an overhaul of DRAM architecture, even if it means a slight compromise in the cost-per-bit metric.\par This paper examines three primary innovations. The first is a modification to DRAM chip microarchitecture that retains the traditional DDRx SDRAM interface. Selective Bit-line Activation (SBA) waits for both RAS (row address) and CAS (column address) signals to arrive before activating exactly those bitlines that provide the requested cache line. SBA reduces energy consumption while incurring slight area and performance penalties. The second innovation, Single Subarray Access (SSA), fundamentally re-organizes the layout of DRAM arrays and the mapping of data to these arrays so that an entire cache line is fetched from a single subarray. It requires a different interface to the memory controller, reduces dynamic and background energy (by about 6X), incurs a slight area penalty (4\%), and can even lead to performance improvements (54\% on average) by reducing queuing delays. The third innovation further penalizes the cost-per-bit metric by adding a checksum feature to each cache line. This checksum error-detection feature can then be used to build stronger RAID-like fault tolerance, including chipkill-level reliability. Such a technique is especially crucial for the SSA architecture where the entire cache line is localized to a single chip. This DRAM chip microarchitectural change leads to a dramatic reduction in the energy and storage overheads for reliability.
The proposed architectures will also apply to other emerging memory technologies (such as resistive memories) and will be less disruptive to standards, interfaces, and the design flow if they can be incorporated into first-generation designs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chipkill; dram architecture; energy-efficiency; locality; subarrays", } @Article{Chen:2010:LPP, author = "Yunji Chen and Weiwu Hu and Tianshi Chen and Ruiyang Wu", title = "{LReplay}: a pending period based deterministic replay scheme", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "187--197", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815985", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Debugging parallel program is a well-known difficult problem. A promising method to facilitate debugging parallel program is using hardware support to achieve deterministic replay. A hardware-assisted deterministic replay scheme should have a small log size, as well as low design cost, to be feasible for adopting by industrial processors. To achieve the goals, we propose a novel and succinct hardware-assisted deterministic replay scheme named LReplay. The key innovation of LReplay is that instead of recording the logical time orders between instructions or instruction blocks as previous investigations, LReplay is built upon recording the pending period information [6]. According to the experimental results on Godson-3, the overall log size of LReplay is about 0.55B/K-Inst (byte per k-instruction) for sequential consistency, and 0.85B/K-Inst for Godson-3 consistency. The log size is smaller in an order of magnitude than state-of-art deterministic replay schemes incurring no performance loss. 
Furthermore, LReplay only consumes about $ 1.3 \% $ area of Godson-3, since it requires only trivial modifications to the existing components of Godson-3. The above features of LReplay demonstrate the potential of integrating hardware-assisted deterministic replay into future industrial processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "deterministic replay; DFD; global clock; multi-core processor; pending period; physical time order", } @Article{Voskuilen:2010:TEA, author = "Gwendolyn Voskuilen and Faraz Ahmad and T. N. Vijaykumar", title = "{Timetraveler}: exploiting acyclic races for optimizing memory race recording", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "198--209", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815986", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As chip multiprocessors emerge as the prevalent microprocessor architecture, support for debugging shared-memory parallel programs becomes important. A key difficulty is the programs' nondeterministic semantics due to which replay runs of a buggy program may not reproduce the bug. The non-determinism stems from memory races where accesses from two threads, at least one of which is a write, go to the same memory location. Previous hardware schemes for memory race recording log the predecessor-successor thread ordering at memory races and enforce the same orderings in the replay run to achieve deterministic replay. To reduce the log size, the schemes exploit transitivity in the orderings to avoid recording redundant orderings. 
To reduce the log size further while requiring minimal hardware, we propose {\em Timetraveler\/} which for the first time exploits acyclicity of races based on the key observation that an acyclic race need not be recorded even if the race is not covered already by transitivity. Timetraveler employs a novel and elegant mechanism called {\em post-dating\/} which both ensures that acyclic races, including those through the L2, are eventually ordered correctly, and identifies cyclic races. To address false cycles through the L2, Timetraveler employs another novel mechanism called {\em time-delay buffer\/} which delays the advancement of the L2 banks' timestamps and thereby reduces the false cycles. Using simulations, we show that Timetraveler reduces the log size for commercial workloads by 88\% over the best previous approach while using only a 696-byte time-delay buffer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "debugging; determinism; race recording; replay", } @Article{Lucia:2010:CES, author = "Brandon Lucia and Luis Ceze and Karin Strauss and Shaz Qadeer and Hans-J. Boehm", title = "Conflict exceptions: simplifying concurrent language semantics with precise hardware exceptions for data-races", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "210--221", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815987", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We argue in this paper that concurrency errors should be treated as exceptions, {\em i.e.}, have fail-stop behavior and precise semantics. We propose an exception model based on conflict of synchronization free regions, which precisely detects a broad class of data-races. 
We show that our exceptions provide enough guarantees to simplify high-level programming language semantics and debugging, but are significantly cheaper to enforce than traditional data-race detection. To make the performance cost of enforcement negligible, we propose architecture support for accurately detecting and precisely delivering these exceptions. We evaluate the suitability of our model as well as the behavior of our architectural mechanisms using the PARSEC benchmark suite and commercial applications. Our results show that the exception model largely reflects how programmers are already writing code and that the main memory, traffic and performance overheads of the enforcement mechanisms we propose are very low.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bug detection; data-races; memory consistency models; multicores; threads", } @Article{Lucia:2010:CAS, author = "Brandon Lucia and Luis Ceze and Karin Strauss", title = "{ColorSafe}: architectural support for debugging and dynamically avoiding multi-variable atomicity violations", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "222--233", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815988", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we propose ColorSafe, an architecture that detects and dynamically avoids single- and multi-variable atomicity violation bugs. The key idea is to group related data into colors and then monitor access interleavings in the 'color space'. This enables detection of atomicity violations involving any data of the same color. We leverage support for meta-data to maintain color information, and signatures to efficiently keep recent color access histories. 
ColorSafe dynamically avoids atomicity violations by inserting ephemeral transactions that prevent erroneous interleavings. ColorSafe has two modes of operation: (1) {\em debugging mode\/} makes detection more precise, producing fewer false positives and collecting more information; and, (2) {\em deployment mode\/} provides robust, efficient dynamic bug avoidance with less precise detection. This makes ColorSafe useful throughout the lifetime of programs, not just during development. Our results show that, in deployment mode, ColorSafe is able to successfully avoid the majority of multi-variable atomicity violations in bug kernels, as well as in large applications (Apache and MySQL). In debugging mode, ColorSafe detects bugs with few false positives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "atomicity violations; bug avoidance; concurrency errors; data coloring; debugging; multi-variable", } @Article{Irwin:2010:SCM, author = "Mary Jane Irwin", title = "Shared caches in multicores: the good, the bad, and the ugly", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "234--234", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815990", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As we transition from clock-frequency performance scaling to performance scaling with multicores, the pressure on the memory hierarchy is increasing dramatically. 
Many different on-chip cache topologies have been proposed/implemented; effective management of these shared caches is crucial to multicore performance.\par This talk will begin with a description of a cache miss classification scheme for multicores (compulsory, inter-core misses, intra-core misses) that gives insight into the interactions between memory transactions of the different cores on a chip sharing a cache. Ways to improve the on-chip cache performance with architectural enhancements, compiler enhancements, and runtime system enhancements will then be discussed. If the application thread mapping and the on-chip topology is static (i.e., does not change during runtime), then compiler enhancements that support cache topology aware code optimization can be used to significantly improve an application's performance. Results from such an augmented compiler, where the topology is exposed to the compiler and where the compiler also does thread-to-core mapping assignments, will be presented. If the application thread mapping or the on-chip topology is dynamic, then other alternatives exist. For example, a thread scheduler, or allocator, can make decisions about moving threads to different cores during runtime in the hopes of improving overall cache performance. Initial experiments with the REEact system being developed by researchers at Penn State--UPittsburgh--UVirginia that 'reacts' to hardware conditions (such as cache miss rates, hot-spots, etc.) by reallocating threads at runtime will be outlined. Finally, if the on-chip cache topology itself is dynamic (i.e., is designed to be reconfigurable at runtime), large performance benefits might be obtained. However, both hardware and software design challenges to realizing such a dynamic system abound. 
Some of these challenges will be briefly discussed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "caches; multicore", } @Article{Meng:2010:DWS, author = "Jiayuan Meng and David Tarjan and Kevin Skadron", title = "Dynamic warp subdivision for integrated branch and memory divergence tolerance", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "235--246", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815992", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "SIMD organizations amortize the area and power of fetch, decode, and issue logic across multiple processing units in order to maximize throughput for a given area and power budget. However, throughput is reduced when a set of threads operating in lockstep (a warp) are stalled due to long latency memory accesses. The resulting idle cycles are extremely costly. Multi-threading can hide latencies by interleaving the execution of multiple warps, but deep multi-threading using many warps dramatically increases the cost of the register files (multi-threading depth $ \times $ SIMD width), and cache contention can make performance worse. Instead, intra-warp latency hiding should first be exploited. This allows threads that are ready but stalled by SIMD restrictions to use these idle cycles and reduces the need for multi-threading among warps. This paper introduces {\em dynamic warp subdivision\/} (DWS), which allows a single warp to occupy more than one slot in the scheduler without requiring extra register file space. Independent scheduling entities allow divergent branch paths to interleave their execution, and allow threads that hit to run ahead. The result is improved latency hiding and memory level parallelism (MLP). 
We evaluate the technique on a coherent cache hierarchy with private L1 caches and a shared L2 cache. With an area overhead of less than 1\%, experiments with eight data-parallel benchmarks show our technique improves performance on average by 1.7$ \times $.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "branch divergence; cache; latency hiding; memory divergence; SIMD; warp", } @Article{Chakradhar:2010:DCC, author = "Srimat Chakradhar and Murugan Sankaradas and Venkata Jakkula and Srihari Cadambi", title = "A dynamically configurable coprocessor for convolutional neural networks", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "247--257", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815993", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Convolutional neural networks (CNN) applications range from recognition and reasoning (such as handwriting recognition, facial expression recognition and video surveillance) to intelligent text applications such as semantic text analysis and natural language processing applications. Two key observations drive the design of a new architecture for CNN. First, CNN workloads exhibit a {\em widely varying mix of three types of parallelism\/}: parallelism within a convolution operation, intra-output parallelism where multiple input sources (features) are combined to create a single output, and inter-output parallelism where multiple, independent outputs (features) are computed simultaneously. Workloads differ significantly across different CNN applications, and across different layers of a CNN. 
Second, the number of processing elements in an architecture continues to scale (as per Moore's law) much faster than off-chip memory bandwidth (or pin-count) of chips. Based on these two observations, we show that for a given number of processing elements and off-chip memory bandwidth, a new CNN hardware architecture that dynamically configures the hardware on-the-fly to match the specific mix of parallelism in a given workload gives the best throughput performance. Our CNN compiler automatically translates high abstraction network specification into a parallel microprogram (a sequence of low-level VLIW instructions) that is mapped, scheduled and executed by the coprocessor. Compared to a 2.3 GHz quad-core, dual socket Intel Xeon, 1.35 GHz C870 GPU, and a 200 MHz FPGA implementation, our 120 MHz dynamically configurable architecture is 4x to 8x faster. This is the {\em first CNN architecture to achieve real-time video stream processing\/} (25 to 30 frames per second) on a wide range of object detection and recognition tasks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "convolutional neural networks; dynamic reconfiguration; parallel computer architecture", } @Article{Blundell:2010:RTR, author = "Colin Blundell and Arun Raghavan and Milo M. K. Martin", title = "{RETCON}: transactional repair without replay", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "258--269", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815995", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Over the past decade there has been a surge of academic and industrial interest in optimistic concurrency, {\em i.e.\/} the speculative parallel execution of code regions that have the semantics of isolation. 
This work analyzes scalability bottlenecks of workloads that use optimistic concurrency. We find that one common bottleneck is updates to auxiliary program data in otherwise non-conflicting operations, {\em e.g.\/} reference count updates and hashtable occupancy field increments.\par To eliminate the performance impact of conflicts on such auxiliary data, this work proposes RETCON, a hardware mechanism that tracks the relationship between input and output values symbolically and uses this symbolic information to transparently repair the output state of a transaction at commit. RETCON is inspired by instruction replay-based mechanisms but exploits simplifying properties of the nature of computations on auxiliary data to perform repair {\em without\/} replay. Our experiments show that RETCON provides significant speedups for workloads that exhibit conflicts on auxiliary data, including transforming a transactionalized version of the Python interpreter from a workload that exhibits no scaling to one that exhibits near-linear scaling on 32 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "parallel programming; transactional memory", } @Article{Lee:2010:TTD, author = "Janghaeng Lee and Haicheng Wu and Madhumitha Ravichandran and Nathan Clark", title = "{Thread Tailor}: dynamically weaving threads together for efficient, adaptive parallel applications", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "270--279", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815996", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Extracting performance from modern parallel architectures requires that applications be divided into many different threads of execution. 
Unfortunately selecting the appropriate number of threads for an application is a daunting task. Having too many threads can quickly saturate shared resources, such as cache capacity or memory bandwidth, thus degrading performance. On the other hand, having too few threads makes inefficient use of the resources available. Beyond static resource assignment, the program inputs and dynamic system state (e.g., what other applications are executing in the system) can have a significant impact on the right number of threads to use for a particular application.\par To address this problem we present the Thread Tailor, a dynamic system that automatically adjusts the number of threads in an application to optimize system efficiency. The Thread Tailor leverages offline analysis to estimate what type of threads will exist at runtime and the communication patterns between them. Using this information Thread Tailor dynamically combines threads to better suit the needs of the target system. Thread Tailor adjusts not only to the architecture, but also other applications in the system, and this paper demonstrates that this type of adjustment can lead to significantly better use of thread-level parallelism in real-world architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dynamic compilation; managed parallelism; threading", } @Article{Hong:2010:IGP, author = "Sunpyo Hong and Hyesoon Kim", title = "An integrated {GPU} power and performance model", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "280--289", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815998", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "GPU architectures are increasingly important in the multi-core era due to their 
high number of parallel processors. Performance optimization for multi-core processors has been a challenge for programmers. Furthermore, optimizing for power consumption is even more difficult. Unfortunately, as a result of the high number of processors, the power consumption of many-core processors such as GPUs has increased significantly.\par Hence, in this paper, we propose an integrated power and performance (IPP) prediction model for a GPU architecture to predict the optimal number of active processors for a given application. The basic intuition is that when an application reaches the peak memory bandwidth, using more cores does not result in performance improvement.\par We develop an empirical power model for the GPU. Unlike most previous models, which require measured execution times, hardware performance counters, or architectural simulations, IPP predicts execution times to calculate dynamic power events. We then use the outcome of IPP to control the number of running cores. We also model the increases in power consumption that resulted from the increases in temperature.\par With the predicted optimal number of active cores, we show that we can save up to 22.09\% of runtime GPU energy consumption and on average 10.99\% of that for the five memory bandwidth-limited benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "analytical model; CUDA; energy; GPU architecture; performance; power estimation", } @Article{Tan:2010:CFF, author = "Zhangxi Tan and Andrew Waterman and Henry Cook and Sarah Bird and Krste Asanovi{\'c} and David Patterson", title = "A case for {FAME}: {FPGA} architecture model execution", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "290--301", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815999", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6
14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Given the multicore microprocessor revolution, we argue that the architecture research community needs a dramatic increase in simulation capacity. We believe FPGA Architecture Model Execution (FAME) simulators can increase the number of useful architecture research experiments per day by two orders of magnitude over Software Architecture Model Execution (SAME) simulators. To clear up misconceptions about FPGA-based simulation methodologies, we propose a FAME taxonomy to distinguish the cost-performance of variations on these ideas. We demonstrate our simulation speedup claim with a case study wherein we employ a prototype FAME simulator, RAMP Gold, to research the interaction between hardware partitioning mechanisms and operating system scheduling policy. The study demonstrates FAME's capabilities: we run a modern parallel benchmark suite on a research operating system, simulate 64-core target architectures with multi-level memory hierarchy timing models, and add experimental hardware mechanisms to the target machine. The simulation speedup achieved by our adoption of FAME---250$ \times $---enables experiments with more realistic time scales and data set sizes than are possible with SAME.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "FPGA; microprocessors; simulation", } @Article{Blake:2010:ETL, author = "Geoffrey Blake and Ronald G.
Dreslinski and Trevor Mudge and Kriszti{\'a}n Flautner", title = "Evolution of thread-level parallelism in desktop applications", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "302--313", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816000", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As the effective limits of frequency and instruction level parallelism have been reached, the strategy of microprocessor vendors has changed to increase the number of processing cores on a single chip each generation. The implicit expectation is that software developers will write their applications with concurrency in mind to take advantage of this sudden change in direction. In this study we analyze whether software developers for laptop/desktop machines have followed the recent hardware trends by creating software for chip multi-processing. We conduct a study of a wide range of applications on Microsoft Windows 7 and Apple's OS X Snow Leopard, measuring {\em Thread Level Parallelism\/} on a high performance workstation and a low power desktop. In addition, we explore graphics processing units (GPUs) and their impact on chip multi-processing. We compare our findings to a study done 10 years ago which concluded that a second core was sufficient to improve system responsiveness. Our results on today's machines show that, 10 years later, surprisingly 2-3 cores are more than adequate for most applications and that the GPU often remains under-utilized. However, in some application specific domains an 8 core SMT system with a 240 core GPU can be effectively utilized. 
Overall these studies suggest that many-core architectures are not a natural fit for current desktop/laptop applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "benchmarking; desktop applications; multi-core; thread level parallelism", } @Article{Reddi:2010:WSU, author = "Vijay Janapa Reddi and Benjamin C. Lee and Trishul Chilimbi and Kushagra Vaid", title = "{Web} search using mobile cores: quantifying and mitigating the price of efficiency", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "314--325", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816002", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The commoditization of hardware, data center economies of scale, and Internet-scale workload growth all demand greater power efficiency to sustain scalability. Traditional enterprise workloads, which are typically memory and I/O bound, have been well served by chip multiprocessors comprised of small, power-efficient cores. Recent advances in mobile computing have led to modern small cores capable of delivering even better power efficiency. While these cores can deliver performance-per-Watt efficiency for data center workloads, small cores impact application quality-of-service robustness, and flexibility, as these workloads increasingly invoke computationally intensive kernels. These challenges constitute the price of efficiency. 
We quantify efficiency for an industry-strength online web search engine in production at both the microarchitecture- and system-level, evaluating search on server and mobile-class architectures using Xeon and Atom processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "bing; energy efficiency; mobile cores; web search", } @Article{Soundararajan:2010:IMO, author = "Vijayaraghavan Soundararajan and Jennifer M. Anderson", title = "The impact of management operations on the virtualized datacenter", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "326--337", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816003", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Virtualization has the potential to dramatically reduce the total cost of ownership of datacenters and increase the flexibility of deployments for general-purpose workloads. If present trends continue, the datacenter of the future will be largely virtualized. The base platform in such a datacenter will consist of physical hosts that run hypervisors, and workloads will run within virtual machines on these platforms. From a system management perspective, the virtualized environment enables a number of new workflows in the datacenter. These workflows involve operations on the physical hosts themselves, such as upgrading the hypervisor, as well as operations on the virtual machines, such as reconfiguration or reverting from snapshots. While traditional datacenter design has focused on the cost vs. 
capability tradeoffs for the end-user applications running in the datacenter, we argue that the management workload from these workflows must be factored into the design of the virtualized datacenter.\par In this paper, we examine data from real-world virtualized deployments to characterize common management workflows and assess their impact on resource usage in the datacenter. We show that while many end-user applications are fairly light on I/O requirements, the management workload has considerable network and disk I/O requirements. We show that the management workload scales with the increasing compute power in the datacenter. Finally, we discuss the implications of this management workload for the datacenter.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cloud computing; datacenter management; management workload; virtual machine management", } @Article{Abts:2010:EPD, author = "Dennis Abts and Michael R. Marty and Philip M. Wells and Peter Klausler and Hong Liu", title = "Energy proportional datacenter networks", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "338--347", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816004", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Numerous studies have shown that datacenter computers rarely operate at full utilization, leading to a number of proposals for creating servers that are {\em energy proportional\/} with respect to the computation that they are performing.\par In this paper, we show that as servers themselves become more energy proportional, the datacenter network can become a significant fraction (up to 50\%) of cluster power. 
In this paper we propose several ways to design a high-performance datacenter network whose power consumption is more proportional to the amount of traffic it is moving -- that is, we propose {\em energy proportional datacenter networks}.\par We first show that a flattened butterfly topology itself is inherently more power efficient than the other commonly proposed topology for high-performance datacenter networks. We then exploit the characteristics of modern plesiochronous links to adjust their power and performance envelopes dynamically. Using a network simulator, driven by both synthetic workloads and production datacenter traces, we characterize and understand design tradeoffs, and demonstrate an 85\% reduction in power --- which approaches the ideal energy-proportionality of the network.\par Our results also demonstrate two challenges for the designers of future network switches: (1) We show that there is a significant power advantage to having independent control of each unidirectional channel comprising a network link, since many traffic patterns show very asymmetric use, and (2) system designers should work to optimize the high-speed channel designs to be more energy efficient by choosing optimal data rate and equalization technology. Given these assumptions, we demonstrate that energy proportional datacenter communication is indeed possible.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "datacenter networks; interconnection networks; low-power networking", } @Article{Thacker:2010:IFE, author = "Charles P. 
Thacker", title = "Improving the future by examining the past", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "348--348", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816006", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "During the last fifty years, the technology underlying computer systems has improved dramatically. As technology has evolved, designers have made a series of choices in the way it was applied in computers. In some cases, decisions that were made in the twentieth century make less sense in the twenty-first. Conversely, paths not taken might now be more attractive given the state of technology today, particularly in light of the limits the field is facing, such as the increasing gap between processor speed and storage access times and the difficulty of cooling today's computers.\par In this talk, I'll discuss some of these choices and suggest some possible changes that might make computing better in the twenty-first century.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "Turing Award", } @Article{Temam:2010:RNN, author = "Olivier Temam", title = "The rebirth of neural networks", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "349--349", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816008", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "After the hype of the 1990s, where companies like Intel or Philips built commercial hardware systems based on neural networks, the approach quickly lost ground for multiple reasons: hardware neural networks were no match for software 
neural networks run on rapidly progressing general-purpose processors, their application scope was considered too limited, and even progress in machine-learning theory overshadowed neural networks.\par However, in the past few years, a remarkable convergence of trends and innovations is casting a new light on neural networks and could make them valuable components of future computing systems. Trends in technology call for architectures which can sustain a large number of defects, something neural networks are intrinsically capable of. Trends in applications, summarized in the recent RMS categorization, highlight a number of key algorithms which are eligible to neural networks implementations. At the same time, innovations in technology, such as the recent realization of a memristor, are creating the conditions for the efficient hardware implementation of neural networks. Innovations in machine learning, with the recent advent of Deep Networks, have revived interest in neural networks. Finally, recent findings in neurobiology carry even greater prospects, where detailed explanations of how complex functions, such as vision, can be implemented further open up the defect-tolerance and application potential of neural network architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "neural networks", } @Article{Keller:2010:NVC, author = "Eric Keller and Jakub Szefer and Jennifer Rexford and Ruby B.
Lee", title = "{NoHype}: virtualized cloud infrastructure without the virtualization", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "350--361", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816010", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cloud computing is a disruptive trend that is changing the way we use computers. The key underlying technology in cloud infrastructures is virtualization -- so much so that many consider virtualization to be one of the key features rather than simply an implementation detail. Unfortunately, the use of virtualization is the source of a significant security concern. Because multiple virtual machines run on the same server and since the virtualization layer plays a considerable role in the operation of a virtual machine, a malicious party has the opportunity to attack the virtualization layer. A successful attack would give the malicious party control over the all-powerful virtualization layer, potentially compromising the confidentiality and integrity of the software and data of any virtual machine. In this paper we propose removing the virtualization layer, while retaining the key features enabled by virtualization. Our NoHype architecture, named to indicate the removal of the hypervisor, addresses each of the key roles of the virtualization layer: arbitrating access to CPU, memory, and I/O devices, acting as a network device (e.g., Ethernet switch), and managing the starting and stopping of guest virtual machines. 
Additionally, we show that our NoHype architecture may indeed be `no hype' since nearly all of the needed features to realize the NoHype architecture are currently available as hardware extensions to processors and I/O devices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cloud computing; hypervisor; many-core; multi-core; security; system architecture; virtualization", } @Article{Eyerman:2010:MCS, author = "Stijn Eyerman and Lieven Eeckhout", title = "Modeling critical sections in {Amdahl's Law} and its implications for multicore design", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "362--370", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816011", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a fundamental law for parallel performance: it shows that parallel performance is not only limited by sequential code (as suggested by Amdahl's law) but is also fundamentally limited by synchronization through critical sections. Extending Amdahl's software model to include critical sections, we derive the surprising result that the impact of critical sections on parallel performance can be modeled as a completely sequential part and a completely parallel part. The sequential part is determined by the probability for entering a critical section and the contention probability (i.e., multiple threads wanting to enter the same critical section). This fundamental result reveals at least three important insights for multicore design. (i) Asymmetric multicore processors deliver less performance benefits relative to symmetric processors than suggested by Amdahl's law, and in some cases even worse performance.
(ii) Amdahl's law suggests many tiny cores for optimum performance in asymmetric processors, however, we find that fewer but larger small cores can yield substantially better performance. (iii) Executing critical sections on the big core can yield substantial speedups, however, performance is sensitive to the accuracy of the critical section contention predictor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "Amdahl's law; analytical performance modeling; critical sections; synchronization", } @Article{Guo:2010:RCA, author = "Xiaochen Guo and Engin Ipek and Tolga Soyata", title = "Resistive computation: avoiding the power wall with low-leakage, {STT-MRAM} based computing", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "371--382", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816012", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As CMOS scales beyond the 45nm technology node, leakage concerns are starting to limit microprocessor performance growth. To keep dynamic power constant across process generations, traditional MOSFET scaling theory prescribes reducing supply and threshold voltages in proportion to device dimensions, a practice that induces an exponential increase in subthreshold leakage. As a result, leakage power has become comparable to dynamic power in current-generation processes, and will soon exceed it in magnitude if voltages are scaled down any further. Beyond this inflection point, multicore processors will not be able to afford keeping more than a small fraction of all cores active at any given moment. 
Multicore scaling will soon hit a power wall.\par This paper presents resistive computation, a new technique that aims at avoiding the power wall by migrating most of the functionality of a modern microprocessor from CMOS to spin-torque transfer magnetoresistive RAM (STT-MRAM)---a CMOS-compatible, leakage-resistant, non-volatile resistive memory technology. By implementing much of the on-chip storage and combinational logic using leakage-resistant, scalable RAM blocks and lookup tables, and by carefully re-architecting the pipeline, an STT-MRAM based implementation of an eight-core Sun Niagara-like CMT processor reduces chip-wide power dissipation by 1.7$ \times $ and leakage power by 2.1$ \times $ at the 32nm technology node, while maintaining 93\% of the system throughput of a CMOS-based design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "power-efficiency; STT-MRAM", } @Article{Seong:2010:SRP, author = "Nak Hee Seong and Dong Hyuk Woo and Hsien-Hsin S. Lee", title = "Security refresh: prevent malicious wear-out and increase durability for phase-change memory with dynamically randomized address mapping", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "383--394", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816014", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Phase change memory (PCM) is an emerging memory technology for future computing systems. Compared to other non-volatile memory alternatives, PCM is more matured to production, and has a faster read latency and potentially higher storage density. The main roadblock precluding PCM from being used, in particular, in the main memory hierarchy, is its limited write endurance.
To address this issue, recent studies proposed to either reduce PCM's write frequency or use wear-leveling to evenly distribute writes. Although these techniques can extend the lifetime of PCM, most of them will not prevent deliberately designed malicious codes from wearing it out quickly. Furthermore, all the prior techniques did not consider the circumstances of a compromised OS and its security implication to the overall PCM design. A compromised OS will allow adversaries to manipulate processes and exploit side channels to accelerate wear-out.\par In this paper, we argue that a PCM design not only has to consider normal wear-out under normal application behavior, most importantly, it must take the worst-case scenario into account with the presence of malicious exploits and a compromised OS to address the durability and security issues simultaneously. In this paper, we propose a novel, low-cost hardware mechanism called Security Refresh to avoid information leak by constantly migrating their physical locations inside the PCM, obfuscating the actual data placement from users and system software. It uses a dynamic randomized address mapping scheme that swaps data using random keys upon each refresh due. The hardware overhead is tiny without using any table. The best lifetime we can achieve under the worst-case malicious attack is more than six years. Also, our scheme incurs around 1\% performance degradation for normal program operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "dynamic address remapping; phase change memory; security; wear leveling", } @Article{Huang:2010:ICM, author = "Ruirui Huang and G. 
Edward Suh", title = "{IVEC}: off-chip memory integrity protection for both security and reliability", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "395--406", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816015", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes a unified off-chip memory integrity protection scheme, named IVEC. Today, a system needs two independent mechanisms in order to protect the memory integrity from both physical attacks and random errors. Integrity verification schemes detect malicious tampering of memory while error correcting codes (ECC) detect and correct random errors. IVEC enables both detection of malicious attacks for security and correction of random errors for reliability at the same time by extending the integrity verification techniques. Analytical and experimental studies show that IVEC can correct single-bit errors and even multi-bit errors from one DRAM chip within a cache block read without any additional ECC bits, when the integrity verification is also required for security, effectively removing the memory and bandwidth overheads (12.5\%) of typical ECC schemes. Alternatively, with parity bits, IVEC can provide even stronger error correction capabilities comparable to the traditional chip-kill correct, still with less overheads. 
For both cases, IVEC can use standard non-ECC DIMMs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "error correction; error detection; fault tolerance; memory systems; reliability; security", } @Article{Shriraman:2010:SLW, author = "Arrvindh Shriraman and Sandhya Dwarkadas", title = "{Sentry}: light-weight auxiliary memory access control", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "407--418", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816016", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Light-weight, flexible access control, which allows software to regulate reads and writes to any granularity of memory region, can help improve the reliability of today's multi-module multi-programmer applications, as well as the efficiency of software debugging tools. Unfortunately, access control in today's processors is tied to support for virtual memory, making its use both heavy weight and coarse grain. In this paper, we propose Sentry, an auxiliary level of virtual memory tagging that is entirely subordinate to existing virtual memory-based protection mechanisms and can be manipulated at the user level. We implement these tags in a complexity-effective manner using an M-cache (metadata cache) structure that only intervenes on L1 misses, thereby minimizing changes to the processor core. Existing cache coherence states are repurposed to implicitly validate permissions for L1 hits. 
Sentry achieves its goal of flexible and light-weight access control without disrupting existing inter-application protection, sidestepping the challenges associated with adding a new protection framework to an existing operating system.\par We illustrate the benefits of our design point using (1) an Apache-based web server that uses the M-cache to enforce protection boundaries among its modules and (2) a watchpoint-based tool to demonstrate low-overhead debugging. Protection is achieved with very few changes to the source code, no changes to the programming model, minimal modifications to the operating system, and with low overhead incurred only when accessing memory regions for which the additional level of access control is enabled.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "access control; cache coherence; memory protection; multiprocessors; protection domains; safety; sentry", } @Article{Herrero:2010:ECC, author = "Enric Herrero and Jos{\'e} Gonz{\'a}lez and Ramon Canal", title = "Elastic cooperative caching: an autonomous dynamically adaptive memory hierarchy for chip multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "419--428", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816018", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Next generation tiled microarchitectures are going to be limited by off-chip misses and by on-chip network usage. Furthermore, these platforms will run an heterogeneous mix of applications with very different memory needs, leading to significant optimization opportunities. 
Existing adaptive memory hierarchies use either centralized structures that limit the scalability or software based resource allocation that increases programming complexity.\par We propose Elastic Cooperative Caching, a dynamic and scalable memory hierarchy that adapts automatically and autonomously to application behavior for each node. Our configuration uses elastic shared/private caches with fully autonomous and distributed repartitioning units for better scalability. Furthermore, we have extended our elastic configuration with an Adaptive Spilling mechanism to use the shared cache space only when it can produce a performance improvement. Elastic caches allow both the creation of big local private caches for threads with high reuse of private data and the creation of big shared spaces from unused caches. Local data allocation in private regions allows to reduce network usage and efficient cache partitioning allows to reduce off-chip misses.\par The proposed scheme outperforms previous proposals by a minimum of 12\% (on average across the benchmarks) and reduces the number of offchip misses by 16\%. Plus, the dynamic and autonomous management of cache resources avoids the reallocation of cache blocks without reuse which results in an increase in energy efficiency of 24\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "chip multiprocessors; elastic cooperative caching; memory hierarchy; tiled microarchitectures", } @Article{Kelm:2010:CHM, author = "John H. Kelm and Daniel R. Johnson and William Tuohy and Steven S. Lumetta and Sanjay J. 
Patel", title = "{Cohesion}: a hybrid memory model for accelerators", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "429--440", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816019", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Two broad classes of memory models are available today: models with hardware cache coherence, used in conventional chip multiprocessors, and models that rely upon software to manage coherence, found in compute accelerators. In some systems, both types of models are supported using disjoint address spaces and/or physical memories. In this paper we present Cohesion, a hybrid memory model that enables fine-grained temporal reassignment of data between hardware-managed and software-managed coherence domains, allowing a system to support both. Cohesion can be used to dynamically adapt to the sharing needs of both applications and runtimes. Cohesion requires neither copy operations nor multiple address spaces.\par Cohesion offers the benefits of reduced message traffic and on-die directory overhead when software-managed coherence can be used and the advantages of hardware coherence for cases in which software-managed coherence is impractical. We demonstrate our protocol using a hierarchical, cached 1024-core processor with a single address space that supports both software-enforced coherence and a directory-based hardware coherence protocol. 
Relative to an optimistic, hardware-coherent baseline, a realizable Cohesion design achieves competitive performance with a $ 2 \times $ reduction in message traffic, $ 2.1 \times $ reduction in directory utilization, and greater robustness to on-die directory capacity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "accelerator; cache coherence; computer architecture", } @Article{Suleman:2010:DMM, author = "M. Aater Suleman and Onur Mutlu and Jos{\'e} A. Joao and Khubaib and Yale N. Patt", title = "Data marshaling for multi-core architectures", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "441--450", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816020", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Previous research has shown that Staged Execution (SE), i.e., dividing a program into segments and executing each segment at the core that has the data and/or functionality to best run that segment, can improve performance and save power. However, SE's benefit is limited because most segments access {\em inter-segment data}, i.e., data generated by the previous segment. When consecutive segments run on different cores, accesses to inter-segment data incur cache misses, thereby reducing performance. This paper proposes {\em Data Marshaling (DM)}, a new technique to eliminate cache misses to inter-segment data. DM uses profiling to identify instructions that generate inter-segment data, and adds only 96 bytes/core of storage overhead. We show that DM significantly improves the performance of two promising Staged Execution models, Accelerated Critical Sections and producer-consumer pipeline parallelism, on both homogeneous and heterogeneous multi-core systems.
In both models, DM can achieve almost all of the potential of ideally eliminating cache misses to inter-segment data. DM's performance benefit increases with the number of cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "cmp; critical sections; pipelining; staged execution", } @Article{Lee:2010:DGV, author = "Victor W. Lee and Changkyu Kim and Jatin Chhugani and Michael Deisher and Daehyun Kim and Anthony D. Nguyen and Nadathur Satish and Mikhail Smelyanskiy and Srinivas Chennupaty and Per Hammarlund and Ronak Singhal and Pradeep Dubey", title = "Debunking the {100X} {GPU} vs. {CPU} myth: an evaluation of throughput computing on {CPU} and {GPU}", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "451--460", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1816021", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent advances in computing have led to an explosion in the amount of data being generated. Processing the ever-growing data in a timely manner has made throughput computing an important aspect for emerging applications. Our analysis of a set of important throughput computing kernels shows that there is an ample amount of parallelism in these kernels which makes them suitable for today's multi-core CPUs and GPUs. In the past few years there have been many studies claiming GPUs deliver substantial speedups (between 10X and 1000X) over multi-core CPUs on these kernels. To understand where such large performance difference comes from, we perform a rigorous performance analysis and find that after applying optimizations appropriate for both CPUs and GPUs the performance gap between an Nvidia GTX280 processor and the Intel Core i7-960 processor narrows to only 2.5x on average. 
In this paper, we discuss optimization techniques for both CPU and GPU, analyze what architecture features contributed to performance differences between the two architectures, and recommend a set of architectural features which provide significant improvement in architectural efficiency for throughput kernels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "CPU architecture; GPU architecture; performance analysis; performance measurement; software optimization; throughput computing", } @Article{Sridharan:2010:UHV, author = "Vilas Sridharan and David R. Kaeli", title = "Using hardware vulnerability factors to enhance {AVF} analysis", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "461--472", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816023", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Fault tolerance is now a primary design constraint for all major microprocessors. One step in determining a processor's compliance to its failure rate target is measuring the Architectural Vulnerability Factor (AVF) of each on-chip structure. The AVF of a hardware structure is the probability that a fault in the structure will affect the output of a program. While AVF generates meaningful insight into system behavior, it cannot quantify the vulnerability of an individual system component (hardware, user program, etc.), limiting the amount of insight that can be generated. To address this, prior work has introduced the Program Vulnerability Factor (PVF) to quantify the vulnerability of software. In this paper, we introduce and analyze the Hardware Vulnerability Factor (HVF) to quantify the vulnerability of hardware.\par HVF has three concrete benefits which we examine in this paper. 
First, HVF analysis can provide insight to hardware designers beyond that gained from AVF analysis alone. Second, separating AVF analysis into HVF and PVF steps can accelerate the AVF measurement process. Finally, HVF measurement enables runtime AVF estimation that combines compile-time PVF estimates with runtime HVF measurements. A key benefit of this technique is that it allows software developers to influence the runtime AVF estimates. We demonstrate that this technique can estimate AVF at runtime with an average absolute error of less than 3\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "architectural vulnerability factor; fault tolerance; reliability", } @Article{Ansari:2010:NES, author = "Amin Ansari and Shuguang Feng and Shantanu Gupta and Scott Mahlke", title = "{Necromancer}: enhancing system throughput by animating dead cores", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "473--484", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816024", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Aggressive technology scaling into the nanometer regime has led to a host of reliability challenges in the last several years. Unlike on-chip caches, which can be efficiently protected using conventional schemes, the general core area is less homogeneous and structured, making tolerating defects a much more challenging problem. Due to the lack of effective solutions, disabling non-functional cores is a common practice in industry to enhance manufacturing yield, which results in a significant reduction in system throughput. 
Although a faulty core cannot be trusted to correctly execute programs, we observe in this work that for most defects, when starting from a valid architectural state, execution traces on a defective core actually coarsely resemble those of fault-free executions. In light of this insight, we propose a robust and heterogeneous core coupling execution scheme, Necromancer, that exploits a functionally dead core to improve system throughput by supplying hints regarding high-level program behavior. We partition the cores in a conventional CMP system into multiple groups in which each group shares a lightweight core that can be substantially accelerated using these execution hints from a potentially dead core. To prevent this {\em undead\/} core from wandering too far from the correct path of execution, we dynamically resynchronize architectural state with the lightweight core. For a 4-core CMP system, on average, our approach enables the coupled core to achieve 78.5\% of the performance of a fully functioning core. 
This defect tolerance and throughput enhancement comes at modest area and power overheads of 5.3\% and 8.5\%, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "execution abstraction; heterogeneous core coupling; manufacturing defects", } @Article{Yan:2010:LCL, author = "Guihai Yan and Xiaoyao Liang and Yinhe Han and Xiaowei Li", title = "Leveraging the core-level complementary effects of {PVT} variations to reduce timing emergencies in multi-core processors", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "485--496", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816025", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Process, Voltage, and Temperature (PVT) variations can significantly degrade the performance benefits expected from next nanoscale technology. The primary circuit implication of the PVT variations is the resultant timing emergencies. In a multi-core processor running multiple programs, variations create spatial and temporal unbalance across the processing cores. Most prior schemes are dedicated to tolerating PVT variations individually for a single core, but ignore the opportunity of leveraging the complementary effects between variations and the intrinsic variation unbalance among individual cores. We find that the notorious delay impacts from different variations are not necessary aggregated. Cores with mild variations can share the violent workload from cores suffering large variations. If operated correctly, variations on different cores can help mitigating each other and result in a variation-mild environment. 
In this paper, we propose Timing Emergency Aware Thread Migration (TEA-TM), a delay sensor-based scheme to reduce system timing emergencies under PVT variations. Fourier transform and frequency domain analysis are conducted to provide the insights and the potential of the PVT co-optimization scheme. Experimental results show on average TEA-TM can help save up to 24\% throughput loss, at the same time improve the system fairness by 85\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "complementary effects; delay sensor; PVT variations; thread migration; timing emergency", } @Article{deKruijf:2010:RAF, author = "Marc de Kruijf and Shuou Nomura and Karthikeyan Sankaralingam", title = "{Relax}: an architectural framework for software recovery of hardware faults", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "497--508", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1816026", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As technology scales ever further, device unreliability is creating excessive complexity for hardware to maintain the illusion of perfect operation. In this paper, we consider whether exposing hardware fault information to software and allowing software to control fault recovery simplifies hardware design and helps technology scaling.\par The combination of emerging applications and emerging many-core architectures makes software recovery a viable alternative to hardware-based fault recovery. Emerging applications tend to have {\em few I/O and memory side-effects}, which limits the amount of information that needs checkpointing, and they allow {\em discarding individual sub-computations\/} with small qualitative impact.
Software recovery can harness these properties in ways that hardware recovery cannot.\par We describe Relax, an architectural framework for software recovery of hardware faults. Relax includes three core components: (1) an ISA extension that allows software to mark regions of code for software recovery, (2) a hardware organization that simplifies reliability considerations and provides energy efficiency with hardware recovery support removed, and (3) software support for compilers and programmers to utilize the Relax ISA. Applying Relax to counter the effects of process variation, our results show a 20\% energy efficiency improvement for PARSEC applications with only minimal source code changes and simpler hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "reliability; software recovery", } @Article{Nuno-Maganda:2010:TCH, author = "Marco Nu{\~n}o-Maganda and Cesar Torres-Huitzil", title = "A temporal coding hardware implementation for spiking neural networks", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "2--7", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926369", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Spiking Neural Networks (SNNs) models have been explored in recent years due to its biological plausibility where temporal coding plays an important role. Biological arguments and computational experiments suggest that some perceptual tasks (vision and olfaction for instance) are well performed by these models. Moreover, some other applications such as machine learning might be benefited from this approach. However, efficient simulation and implementation of SNNs still remain an open challenge.
There are several issues that must be addressed, being one of them the temporal coding of real-value data itself. In order to study the possibilities of embedded real-time implementations of large scale SNNs, we have first chosen to implement a well-known coding scheme based on Gaussian Receptive Fields (GRFs) to map real-value data into spike trains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Morisita:2010:IEA, author = "Hirokazu Morisita and Kenta Inakagata and Yasunori Osana and Naoyuki Fujita and Hideharu Amano", title = "Implementation and evaluation of an arithmetic pipeline on {FLOPS-$2$D}: multi-{FPGA} system", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "8--13", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926370", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "UPACS (Unified Platform for Aerospace Computational Simulation) is one of the practical CFD (Computational Fluid Dynamics) packages supporting various selectability. A custom machine for efficient execution of MUSCL; a core functions of UPACS is implemented on FLOPS-2D (Flexibly Linkable Object for Programmable System); multi-FPGA reconfigurable system. The deep and complicated pipeline structure generated from MUSCL dataflow is divided and optimized into two FPGA boards by using a tuning tool called RER. With optimization of the order of operations and pipeline structure, about 60\% utilization of the pipeline is achieved even by using serial links between two boards.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tse:2010:ERD, author = "Anson H. T. Tse and David B. Thomas and K. H. 
Tsoi and Wayne Luk", title = "Efficient reconfigurable design for pricing {Asian} options", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "14--20", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926371", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Arithmetic Asian options are financial derivatives which have the feature of path-dependency: they depend on the entire price path of the underlying asset, rather than just the instantaneous price. This path-dependency makes them difficult to price, as only computationally intensive Monte-Carlo methods can provide accurate prices. This paper proposes an FPGA-accelerated Asian option pricing solution, using a highly-optimised parallel Monte-Carlo architecture. The proposed pipelined design is described parametrically, facilitating its re-use for different technologies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Horita:2010:FBF, author = "Tadayoshi Horita and Itsuo Takanami", title = "An {FPGA}-based fast classifier with high generalization property", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "21--26", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926372", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes a scheme to implement classifiers with high generalization properties on FPGAs. 
The classifiers consist of only combinational logic circuits, which are based on a simple concept, and the VHDL source files which describe the classifiers are generated by a C-language function, tuning VHDL notations for adders in them to reduce both its hardware size and computation time. Simulation results based on a character recognition are shown in terms of generalization property, hardware size, computation time, and electricity consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Putnam:2010:DVE, author = "Andrew Putnam and Aaron Smith and Doug Burger", title = "Dynamic vectorization in the {E2} dynamic multicore architecture", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "27--32", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926373", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Previous research has shown that Explicit Data Graph Execution (EDGE) instruction set architectures (ISA) allow for power efficient performance scaling. In this paper we describe the preliminary design of a new dynamic multicore processor called E2 that utilizes an EDGE ISA to allow for the dynamic composition of physical cores into logical processors. 
We provide details of E2's support for dynamic reconfigurability and show how the EDGE ISA facilitates out-of-order vector execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Paek:2010:BAU, author = "Jong Kyung Paek and Kiyoung Choi and Jongeun Lee", title = "Binary acceleration using coarse-grained reconfigurable architecture", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "33--39", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926374", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Coarse-grained reconfigurable architectures (CGRAs) have been well-researched and shown to be particularly effective in acceleration of data-intensive applications. However, practical difficulties in application mapping have hindered their widespread adoption. Typically, an application must be modified manually or by using special compilers and design tools in order to fully exploit the architecture. This incurs considerable design costs to the application developer and reduces software portability. In this paper, we propose a framework for automatic transformation of an application at binary-level, with which the user can execute an arbitrary application on the CGRA. Our approach analyzes the binary code and determines which portions of the program to accelerate, maps them to the reconfigurable array, then modifies the binary code appropriately to run on the CGRA.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Dohi:2010:IPE, author = "Keisuke Dohi and Yuichiro Shibata and Tsuyoshi Hamada and Tomonari Masada and Kiyoshi Oguri and Duncan A.
Buell", title = "Implementation of a programming environment with a multithread model for reconfigurable systems", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "40--45", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926375", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Reconfigurable systems are known to be able to achieve higher performance than traditional microprocessor architecture for many application fields. However, in order to extract a full potential of the reconfigurable systems, programmers often have to design and describe the best suited code for their target architecture with specialized knowledge. The aim of this paper is to assist the users of reconfigurable systems by implementing a translator with a multithread model. The experimental results show our translator automatically generates efficient performance-aware code segments including DMA transfer and shift registers for memory access optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sabeghi:2010:RMS, author = "Mojtaba Sabeghi and Hamid Mushtaq and Koen Bertels", title = "Runtime multitasking support on polymorphic platforms", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "46--52", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926376", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "General purpose computers are moving towards employing reconfigurable fabrics in order to achieve higher performance. 
In such systems, serving several applications at runtime is a challenging problem in which the reconfigurable fabric has to be shared among competing tasks. Because of the inherent complexity of mapping the computation intensive tasks into the FPGA, a comprehensive runtime system is required to address all the conflicting issues between competing applications' demands and to keep the system performance at the required level. In this paper, we present a runtime environment wherein a number of components introduced to handle the task assignment problem in a very low overhead manner.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tsoi:2010:PFC, author = "Kuen Hung Tsoi and Anson H. T. Tse and Peter Pietzuch and Wayne Luk", title = "Programming framework for clusters with heterogeneous accelerators", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "53--59", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926377", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We describe a programming framework for high performance clusters with various hardware accelerators. In this framework, users can utilize the available heterogeneous resources productively and efficiently. The distributed application is highly modularized to support dynamic system configuration with changing types and number of the accelerators. Multiple layers of communication interface are introduced to reduce the overhead in both control messages and data transfers. Parallelism can be achieved by controlling the accelerators in various schemes through scheduling extension. 
The framework has been used to support physics simulation and financial application development.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tadonki:2010:ECL,
  author =       "Claude Tadonki and Gilbert Grosdidier and Olivier P{\`e}ne",
  title =        "An efficient {CELL} library for lattice quantum chromodynamics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "60--65",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926378",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Quantum chromodynamics (QCD) is the theory of subnuclear physics, aiming at modeling the strong nuclear force, which is responsible for the interactions of nuclear particles. Numerical QCD studies are performed through a discrete formalism called LQCD (Lattice Quantum Chromodynamics). Typical simulations involve very large volume of data and numerically sensitive entities, thus the crucial need of high performance computing systems. We propose a set of CELL-accelerated routines for basic LQCD calculations. Our framework is provided as a unified library and is particularly optimized for an iterative use. Each routine is parallelized among the SPUs, and each SPU achieves its task by looping on small chunk of arrays from the main memory.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Taylor:2010:SBB,
  author =       "Ryan Taylor and Xiaoming Li",
  title =        "Software-based branch predication for {AMD GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "66--72",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926379",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Branch predication is a program transformation technique that combines instructions of multiple branches of an if statement into a straight-line sequence and associates each instruction of the sequence with a predicate. The branch predication improves the execution of branch statements on processors that support predicated execution of instruction, e.g., Intel IA-64, because such transformation improves the instruction scheduling and might help cache performance. This paper proposes a novel software-based branch predication technique for GPU. The main motivation is that branch instructions can easily become a performance bottleneck for a GPU program because of the cost of branch instructions compared to ALU instructions and the possibility of low ALU utilization due to separation of ALU instructions within control flow blocks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Banescu:2010:MFP,
  author =       "Sebastian Banescu and Florent de Dinechin and Bogdan Pasca and Radu Tudoran",
  title =        "Multipliers for floating-point double precision and beyond on {FPGAs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "73--79",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926380",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The implementation of high-precision floating-point applications on reconfigurable hardware requires large multipliers. Full multipliers are the core of floating-point multipliers. Truncated multipliers, trading resources for a well-controlled accuracy degradation, are useful building blocks in situations where a full multiplier is not needed.\par This work studies the automated generation of such multipliers using the embedded multipliers and adders present in the DSP blocks of current FPGAs. The optimization of such multipliers is expressed as a tiling problem, where a tile represents a hardware multiplier, and super-tiles represent combinations of several hardware multipliers and adders, making efficient use of the DSP internal resources. This tiling technique is shown to adapt to full or truncated multipliers. It addresses arbitrary precisions including single, double but also the quadruple precision introduced by the IEEE-754-2008 standard and currently unsupported by processor hardware.
An open-source implementation is provided in the FloPoCo project.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sano:2010:PIA,
  author =       "Kentaro Sano and Luzhou Wang and Satoru Yamamoto",
  title =        "Prototype implementation of array-processor extensible over multiple {FPGAs} for scalable stencil computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "80--86",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926381",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper demonstrates and evaluates the performance and the scalability of the systolic computational-memory array (SCMA) for stencil computation, which is a typical computing kernel of scientific simulation. We describe the basic architecture of the SCMA, and show the requirements and the design of SCMAs to scalably operate over multiple devices. We implement a prototype of the SCMA with three ALTERA Stratix III FPGAs, which form a $1 \times 3$ FPGA array by connecting three DE3 boards with different clock sources. The prototype SCMA demonstrates that the difference in operating clock frequency hardly influences the total execution cycles while it slightly causes stall cycles to sub-SCMAs on different FPGAs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tsang:2010:DPR,
  author =       "Chi-Chiu Tsang and Hayden Kwok-Hay So",
  title =        "Dynamic power reduction of {FPGA}-based reconfigurable computers using precomputation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "87--92",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926382",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper examines the effectiveness of employing precomputation techniques to reduce power consumption of field configurable computing systems. Multiplier is modified with precomputation techniques and are implemented using commercial off-the-shelf FPGAs. Precomputation techniques reduce dynamic power consumption of a module by eliminating unnecessary signal switching activities in inactive portions of the modules. Experiments have shown that up to 52\% of logic and signal power consumption can be reduced in multiplier module.
Furthermore, when compared to ASIC implementations, FPGA implementations of precomputation modules have the advantage of lower area overhead as most of them can be implemented using originally unoccupied related FPGA resources.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2010:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "93--96",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926384",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mukherjee:2010:NAC,
  author =       "Manideepa Mukherjee and Amitabha Sinha",
  title =        "A novel architecture for conversion of binary to single digit double base numbers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "5",
  pages =        "1--6",
  month =        dec,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1978907.1978909",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 13 11:25:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Double base number systems are increasingly attractive for many compute intensive applications especially in signal processing because of their capabilities of handling arithmetic operations efficiently. However, the complexity involved in converting binary to DBNS becomes a major bottleneck and the efficiency of performance goes down drastically due to the complexity involved in conversion.
Since complexity of multi digit DBNS multiplications and additions increases with the number of digits (index i,j), in this paper a novel conversion scheme has been proposed where a given binary number will be converted to a single digit (index i,j) double base number. The proposed scheme not only reduces the hardware complexity of the arithmetic operations but also reduces the time of execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{T:2010:DDF,
  author =       "Shobha T. and Syed Akram and G. Varaprasad",
  title =        "Design and development of framework for diagnosing intermediate nodes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "5",
  pages =        "7--11",
  month =        dec,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1978907.1978910",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 13 11:25:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A framework is an integrated system that sets the rules of Automation of a specific product. This system integrates the function libraries, test data sources, object details and various reusable modules. This paper proposes a framework, used for diagnosing and performance analysis of intermediate network nodes such as load balancer, routers, servers etc. For analyzing the performance $m$ number of servers and $n$ number of clients are considered. This framework will help developers working on network nodes to check for the performance of network node component and also to detect the errors in the algorithms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tabba:2010:ACP,
  author =       "Fuad Tabba",
  title =        "Adding concurrency in {Python} using a commercial processor's hardware transactional memory support",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "5",
  pages =        "12--19",
  month =        dec,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1978907.1978911",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 13 11:25:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper reports on our experiences of using a commercial processor's best-effort hardware transactional memory to improve concurrency in CPython, the reference Python implementation. CPython protects its data structures using a single global lock, which inhibits parallelism when running multiple threads.\par We modified the CPython interpreter to use best-effort hardware transactions available in Sun's Rock processor, and fall back on the single global lock when unable to commit in hardware. The modifications were minimal; however, we had to restructure some of CPython's shared data structures to handle false conflicts arising from CPython's management of the shared data.
Our results show that the modified CPython interpreter can run small, simple, workloads and scale almost linearly, while improving the concurrency of more complex workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomasian:2010:WSD,
  author =       "Alexander Thomasian",
  title =        "Why specialized disks for composite operations may be unnecessary",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "5",
  pages =        "20--27",
  month =        dec,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1978907.1978912",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 13 11:25:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Disk arrays with erasure coding such as RAID5 and RAID6 incur four and six disk accesses respectively for updating data and check blocks. The small write penalty can be reduced by the Read-Modify-Write (RMW) composite operations to update data and associated check blocks. The Disk Architecture with Composite Operation (DACO) is a proposal to eliminate the disk rotation associated with RMWs, by using a complex read/write head, which allows the writing of a block immediately after reading and modifying it without needing an extra disk rotation. We argue that the extra cost associated with DACO may not be justifiable, because it is not expected to have a significant impact on RAID performance. Furthermore an XOR capability is still required at the disk array controller for reconstructing missing data blocks. A duplexed Nonvolatile Storage (NVS) cache at the disk array controller provides the same reliability as magnetic disks and allows fast writes, i.e., writing to disk is considered completed as soon as data is written onto NVS. Deferring the destaging of data blocks from NVS allows these blocks to be overwritten, obviating unnecessary disk writes. This also allows neighboring dirty blocks to be destaged in batches, so that a higher disk access efficiency is attained. Disks with multiple arms can also be used to make the processing of RMW requests more efficient, while disks with multiple R/W heads on one arm have little effect on RMW requests. In addition there are alternative methods to update check blocks, such as floating parities, parity logging, the reconstruct write method, log structured arrays, and variable scope parity protection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2010:INc,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "5",
  pages =        "28--36",
  month =        dec,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1978907.1978914",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 13 11:25:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Larus:2011:CWC,
  author =       "James R.
Larus",
  title =        "The cloud will change everything",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950367",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yuan:2011:ISD,
  author =       "Ding Yuan and Jing Zheng and Soyeon Park and Yuanyuan Zhou and Stefan Savage",
  title =        "Improving software diagnosability via log enhancement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "3--14",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950369",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Veeraraghavan:2011:DPS,
  author =       "Kaushik Veeraraghavan and Dongyoon Lee and Benjamin Wester and Jessica Ouyang and Peter M. Chen and Jason Flinn and Satish Narayanasamy",
  title =        "{DoublePlay}: parallelizing sequential logging and replay",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "15--26",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950370",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Casper:2011:HAT,
  author =       "Jared Casper and Tayo Oguntebi and Sungpack Hong and Nathan G.
Bronson and Christos Kozyrakis and Kunle Olukotun",
  title =        "Hardware acceleration of transactional memory on commodity systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "27--38",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950372",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dalessandro:2011:HNC,
  author =       "Luke Dalessandro and Fran{\c{c}}ois Carouge and Sean White and Yossi Lev and Mark Moir and Michael L. Scott and Michael F. Spear",
  title =        "Hybrid {NOrec}: a case study in the effectiveness of best effort hardware transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "39--52",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950373",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singh:2011:EPS,
  author =       "Abhayendra Singh and Daniel Marino and Satish Narayanasamy and Todd Millstein and Madan Musuvathi",
  title =        "Efficient processor support for {DRFx}, a memory model with exceptions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "53--66",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950375",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
journal-URL = "https://dl.acm.org/loi/sigarch",
}

@Article{Devietti:2011:RRC,
  author =       "Joseph Devietti and Jacob Nelson and Tom Bergan and Luis Ceze and Dan Grossman",
  title =        "{RCDC}: a relaxed consistency deterministic computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "67--78",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950376",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Burnim:2011:SCS,
  author =       "Jacob Burnim and George Necula and Koushik Sen",
  title =        "Specifying and checking semantic atomicity for multithreaded programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "79--90",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950377",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Volos:2011:MLP,
  author =       "Haris Volos and Andres Jaan Tack and Michael M.
Swift",
  title =        "{Mnemosyne}: lightweight persistent memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "91--104",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950379",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Coburn:2011:NHM,
  author =       "Joel Coburn and Adrian M. Caulfield and Ameen Akel and Laura M. Grupp and Rajesh K. Gupta and Ranjit Jhala and Steven Swanson",
  title =        "{NV-Heaps}: making persistent objects fast and safe with next-generation, non-volatile memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "105--118",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950380",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schupbach:2011:DLA,
  author =       "Adrian Sch{\"u}pbach and Andrew Baumann and Timothy Roscoe and Simon Peter",
  title =        "A declarative language approach to device configuration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "119--132",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950382",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ryzhyk:2011:IDD,
  author =       "Leonid Ryzhyk and
John Keys and Balachandra Mirla and Arun Raghunath and Mona Vij and Gernot Heiser",
  title =        "Improved device driver reliability through hardware verification reuse",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "133--144",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950383",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hashmi:2011:CNI,
  author =       "Atif Hashmi and Andrew Nere and James Jamal Thomas and Mikko Lipasti",
  title =        "A case for neuromorphic {ISAs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "145--158",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950385",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ransford:2011:MSS,
  author =       "Benjamin Ransford and Jacob Sorber and Kevin Fu",
  title =        "{Mementos}: system support for long-running computation on {RFID}-scale devices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "159--170",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950386",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Koukoumidis:2011:PC,
  author =       "Emmanouil Koukoumidis and Dimitrios
Lymberopoulos and Karin Strauss and Jie Liu and Doug Burger",
  title =        "Pocket cloudlets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "171--184",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950387",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sharma:2011:BMS,
  author =       "Navin Sharma and Sean Barker and David Irwin and Prashant Shenoy",
  title =        "{Blink}: managing server clusters on intermittent power",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "185--198",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950389",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hoffmann:2011:DKR,
  author =       "Henry Hoffmann and Stelios Sidiroglou and Michael Carbin and Sasa Misailovic and Anant Agarwal and Martin Rinard",
  title =        "Dynamic knobs for responsive power-aware computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "199--212",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950390",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:2011:FSD,
  author =       "Song Liu and Karthik Pattabiraman and Thomas Moscibroda and
Benjamin G. Zorn",
  title =        "{Flikker}: saving {DRAM} refresh-power through critical data partitioning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "213--224",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950391",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Deng:2011:MAL,
  author =       "Qingyuan Deng and David Meisner and Luiz Ramos and Thomas F. Wenisch and Ricardo Bianchini",
  title =        "{MemScale}: active low-power modes for main memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "225--238",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950392",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gao:2011:TMH,
  author =       "Qi Gao and Wenbin Zhang and Zhezhe Chen and Mai Zheng and Feng Qin",
  title =        "{2ndStrike}: toward manifesting hidden concurrency typestate bugs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "239--250",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950394",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zhang:2011:CDC,
  author =       "Wei Zhang and Junghee Lim and Ramya Olichandran and Joel
Scherpelz and Guoliang Jin and Shan Lu and Thomas Reps",
  title =        "{ConSeq}: detecting concurrency bugs through sequential errors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "251--264",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950395",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chipounov:2011:SPV,
  author =       "Vitaly Chipounov and Volodymyr Kuznetsov and George Candea",
  title =        "{S2E}: a platform for in-vivo multi-path analysis of software systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "265--278",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950396",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hofmann:2011:EOS,
  author =       "Owen S. Hofmann and Alan M. Dunn and Sangman Kim and Indrajit Roy and Emmett Witchel",
  title =        "Ensuring operating system kernel integrity with {OSck}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "279--290",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950398",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Porter:2011:RLT,
  author =       "Donald E.
Porter and Silas Boyd-Wickizer and Jon Howell and Reuben Olinsky and Galen C. Hunt",
  title =        "Rethinking the library {OS} from the top down",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "291--304",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950399",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Palix:2011:FLT,
  author =       "Nicolas Palix and Ga{\"e}l Thomas and Suman Saha and Christophe Calv{\`e}s and Julia Lawall and Gilles Muller",
  title =        "Faults in {Linux}: ten years later",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "305--318",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950401",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In 2001, Chou et al. published a study of faults found by applying a static analyzer to Linux versions 1.0 through 2.4.1. A major result of their work was that the drivers directory contained up to 7 times more of certain kinds of faults than other directories. This result inspired a number of development and research efforts on improving the reliability of driver code. Today Linux is used in a much wider range of environments, provides a much wider range of services, and has adopted a new development and release model. What has been the impact of these changes on code quality? Are drivers still a major problem?\par To answer these questions, we have transported the experiments of Chou et al. to Linux versions 2.6.0 to 2.6.33, released between late 2003 and early 2010. We find that Linux has more than doubled in size during this period, but that the number of faults per line of code has been decreasing. And, even though drivers still accounts for a large part of the kernel code and contains the most faults, its fault rate is now below that of other directories, such as arch (HAL) and fs (file systems). These results can guide further development and research efforts. To enable others to continually update these results as Linux evolves, we define our experimental protocol and make our checkers and results available in a public archive.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Esmaeilzadeh:2011:LBL,
  author =       "Hadi Esmaeilzadeh and Ting Cao and Yang Xi and Stephen M. Blackburn and Kathryn S. McKinley",
  title =        "Looking back on the language and hardware revolutions: measured power, performance, and scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "319--332",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950402",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nguyen:2011:SCS,
  author =       "Donald Nguyen and Keshav Pingali",
  title =        "Synthesizing concurrent schedulers for irregular algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "333--344",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950404",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hoang:2011:ECT, author = "Giang Hoang and Robby Bruce Findler and Russ Joseph", title = "Exploring circuit timing-aware language and compilation", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "345--356", month = mar, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1961295.1950405", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Farhad:2011:OAM, author = "Sardar M. Farhad and Yousun Ko and Bernd Burgstaller and Bernhard Scholz", title = "Orchestration by approximation: mapping stream programs onto multicore architectures", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "357--368", month = mar, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1961295.1950406", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhang:2011:FED, author = "Eddy Z. 
Zhang and Yunlian Jiang and Ziyu Guo and Kai Tian and Xipeng Shen", title = "On-the-fly elimination of dynamic irregularities for {GPU} computing", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "369--380", month = mar, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1961295.1950408", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hormati:2011:SPS, author = "Amir H. Hormati and Mehrzad Samadi and Mark Woh and Trevor Mudge and Scott Mahlke", title = "{Sponge}: portable stream programming on graphics engines", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "381--392", month = mar, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1961295.1950409", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kamruzzaman:2011:ICP, author = "Md Kamruzzaman and Steven Swanson and Dean M. 
Tullsen", title = "Inter-core prefetching for multicore processors using migrating helper threads", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "393--404", month = mar, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1961295.1950411", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hayashizaki:2011:IPT, author = "Hiroshige Hayashizaki and Peng Wu and Hiroshi Inoue and Mauricio J. Serrano and Toshio Nakatani", title = "Improving the performance of trace-based systems by false loop filtering", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "405--418", month = mar, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1961295.1950412", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Binkert:2011:GS, author = "Nathan Binkert and Bradford Beckmann and Gabriel Black and Steven K. Reinhardt and Ali Saidi and Arkaprava Basu and Joel Hestness and Derek R. Hower and Tushar Krishna and Somayeh Sardashti and Rathijit Sen and Korey Sewell and Muhammad Shoaib and Nilay Vaish and Mark D. Hill and David A. 
Wood", title = "The {\tt gem5} simulator", journal = j-COMP-ARCH-NEWS, volume = "39", number = "2", pages = "1--7", month = may, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024716.2024718", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Sep 1 17:35:28 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The gem5 simulation infrastructure is the merger of the best aspects of the M5 [4] and GEMS [9] simulators. M5 provides a highly configurable simulation framework, multiple ISAs, and diverse CPU models. GEMS complements these features with a detailed and flexible memory system, including support for multiple cache coherence protocols and interconnect models. Currently, gem5 supports most commercial ISAs (ARM, ALPHA, MIPS, Power, SPARC, and x86), including booting Linux on three of them (ARM, ALPHA, and x86). The project is the result of the combined efforts of many academic and industrial institutions, including AMD, ARM, HP, MIPS, Princeton, MIT, and the Universities of Michigan, Texas, and Wisconsin.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thomasian:2011:SAD, author = "Alexander Thomasian", title = "Survey and analysis of disk scheduling methods", journal = j-COMP-ARCH-NEWS, volume = "39", number = "2", pages = "8--25", month = may, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024716.2024719", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Sep 1 17:35:28 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Performance of many important computer applications depends on the performance of Hard Disk Drives (HDDs). Disk capacities and transfer rates have been increasing rapidly, but the improvement in disk access time is disappointingly slow. 
Caching and prefetching are two methods to alleviate this delay, which is 6-7 orders of magnitude longer than the processor cycle time. Disk scheduling is desirable when the data is not cached and a disk access is required. This paper is concerned with the analysis of two disk arm scheduling methods: SATF (shortest access time first) which outperforms SCAN, while both methods outperform FCFS scheduling. We propose improvements to a recent analysis of the SCAN policy and carry out an empirical investigation of SATF performance to derive a relationship between the queue-length and mean service time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{K:2011:LPT, author = "Thimmarayaswamy K and Mary M. Dsouza and G. Varaprasad", title = "Low power techniques for an {Android} based phone", journal = j-COMP-ARCH-NEWS, volume = "39", number = "2", pages = "26--35", month = may, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024716.2024720", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Thu Sep 1 17:35:28 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Android is the latest trend in mobile operating systems. Even though Android provides a complete set of application, middleware and Linux kernel for the phone applications developer, it does not fully utilize several standard kernel features. This work attempts to address the limitations of Android specific to power management at kernel level and proposes possible solutions for active and static power management in Linux to overcome these limitations. The developed solutions for active power management include selection of suitable governor algorithm and modification of its parameters and implementation of a daemon process, which performs voltage and frequency scaling.
Application level low power techniques for Android are also proposed to help application developers to optimize their software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2011:INa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "39", number = "2", pages = "36--52", month = may, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024716.2024722", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 1 17:35:28 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hashmi:2011:AAF, author = "Atif Hashmi and Hugues Berry and Olivier Temam and Mikko Lipasti", title = "Automatic abstraction and fault tolerance in cortical microarchitectures", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "1--10", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000066", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Choudhary:2011:FCS, author = "Niket K. Choudhary and Salil V. Wadhavkar and Tanmay A. Shah and Hiran Mayukh and Jayneel Gandhi and Brandon H. Dwiel and Sandeep Navada and Hashem H.
Najaf-abadi and Eric Rotenberg", title = "{FabScalar}: composing synthesizable {RTL} designs of arbitrary cores within a canonical superscalar template", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "11--22", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000067", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gunadi:2011:CCR, author = "Erika Gunadi and Mikko H. Lipasti", title = "{CRIB}: consolidated rename, issue, and bypass", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "23--32", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000068", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agarwal:2011:FIF, author = "Rishi Agarwal and Josep Torrellas", title = "{FlexBulk}: intelligently forming atomic blocks in blocked-execution multiprocessors to minimize squashes", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "33--44", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000070", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kwon:2011:VPA, author = "Youngjin Kwon and Changdae Kim and Seungryoul 
Maeng and Jaehyuk Huh", title = "Virtualizing performance asymmetric multi-core systems", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "45--56", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000071", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sanchez:2011:VSE, author = "Daniel Sanchez and Christos Kozyrakis", title = "{Vantage}: scalable and efficient fine-grain cache partitioning", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "57--68", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000073", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mishra:2011:ACI, author = "Asit K. Mishra and Xiangyu Dong and Guangyu Sun and Yuan Xie and N. Vijaykrishnan and Chita R. 
Das", title = "Architecting on-chip interconnects for stacked {$3$D} {STT-RAM} caches in {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "69--80", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000074", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gaur:2011:BIA, author = "Jayesh Gaur and Mainak Chaudhuri and Sreenivas Subramoney", title = "Bypass and insertion algorithms for exclusive last-level caches", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "81--92", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000075", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cuesta:2011:IED, author = "Blas A. Cuesta and Alberto Ros and Mar{\'\i}a E. G{\'o}mez and Antonio Robles and Jos{\'e} F. 
Duato", title = "Increasing the effectiveness of directory caches by deactivating coherence for private memory blocks", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "93--104", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000076", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Oh:2011:TSM, author = "Jungju Oh and Milos Prvulovic and Alenka Zajic", title = "{TLSync}: support for multiple fast barriers using on-chip transmission lines", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "105--116", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000078", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Crago:2011:OEM, author = "Neal Clayton Crago and Sanjay Jeram Patel", title = "{OUTRIDER}: efficient memory latency tolerance with decoupled strands", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "117--128", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000079", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:2011:ETB, author = "Yunsup Lee and Rimas Avizienis and Alex Bishara and Richard Xia and 
Derek Lockhart and Christopher Batten and Krste Asanovi{\'c}", title = "Exploring the tradeoffs between programmability and efficiency in data-parallel accelerators", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "129--140", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000080", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ebrahimi:2011:PAS, author = "Eiman Ebrahimi and Chang Joo Lee and Onur Mutlu and Yale N. Patt", title = "Prefetch-aware shared resource management for multi-core systems", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "141--152", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000081", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Agarwal:2011:RSC, author = "Rishi Agarwal and Pranav Garg and Josep Torrellas", title = "Rebound: scalable checkpointing for coherent shared memory", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "153--164", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000083", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Greathouse:2011:DDS, author = 
"Joseph L. Greathouse and Zhiqiang Ma and Matthew I. Frank and Ramesh Peri and Todd Austin", title = "Demand-driven software race detection using hardware performance counters", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "165--176", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000084", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chhabra:2011:NSN, author = "Siddhartha Chhabra and Yan Solihin", title = "{i-NVMM}: a secure non-volatile main memory system with incremental encryption", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "177--188", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000086", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tiwari:2011:CUM, author = "Mohit Tiwari and Jason K. Oberg and Xun Li and Jonathan Valamehr and Timothy Levin and Ben Hardekopf and Ryan Kastner and Frederic T. 
Chong and Timothy Sherwood", title = "Crafting a usable microkernel, processor, and {I/O} system with strict and provable information flow security", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "189--200", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000087", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nomura:2011:SDP, author = "Shuou Nomura and Matthew D. Sinclair and Chen-Han Ho and Venkatraman Govindaraju and Marc de Kruijf and Karthikeyan Sankaralingam", title = "Sampling $+$ {DMR}: practical and low-overhead permanent fault detection", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "201--212", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000089", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sudhakrishnan:2011:REB, author = "Sangeetha Sudhakrishnan and Rigo Dicochea and Jose Renau", title = "Releasing efficient beta cores to market early", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "213--222", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000090", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = 
"https://dl.acm.org/loi/sigarch", } @Article{Manoochehri:2011:CCP, author = "Mehrtash Manoochehri and Murali Annavaram and Michel Dubois", title = "{CPPC}: correctable parity protected cache", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "223--234", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000091", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gebhart:2011:EEM, author = "Mark Gebhart and Daniel R. Johnson and David Tarjan and Stephen W. Keckler and William J. Dally and Erik Lindholm and Kevin Skadron", title = "Energy-efficient mechanisms for managing thread context in throughput processors", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "235--246", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000093", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yu:2011:SDH, author = "Wing-kei S. Yu and Ruirui Huang and Sarah Q. Xu and Sung-En Wang and Edwin Kan and G. 
Edward Suh", title = "{SRAM--DRAM} hybrid memory with applications to efficient register files in fine-grained multi-threading", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "247--258", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000094", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fu:2011:ATM, author = "Binzhang Fu and Yinhe Han and Jun Ma and Huawei Li and Xiaowei Li", title = "An abacus turn model for time\slash space-efficient reconfigurable routing", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "259--270", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000096", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Carpenter:2011:CGS, author = "Aaron Carpenter and Jianyun Hu and Jie Xu and Michael Huang and Hui Wu", title = "A case for globally shared-medium on-chip interconnect", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "271--282", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000097", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tang:2011:IMS, author = "Lingjia Tang and Jason 
Mars and Neil Vachharajani and Robert Hundt and Mary Lou Soffa", title = "The impact of memory subsystem resource sharing on datacenter applications", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "283--294", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000099", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yoon:2011:AGM, author = "Doe Hyun Yoon and Min Kyu Jeong and Mattan Erez", title = "Adaptive granularity memory systems: a tradeoff between storage efficiency and throughput", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "295--306", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000100", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Barr:2011:SMS, author = "Thomas W. Barr and Alan L. 
Cox and Scott Rixner", title = "{SpecTLB}: a mechanism for speculative address translation", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "307--318", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000101", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Meisner:2011:PMO, author = "David Meisner and Christopher M. Sadler and Luiz Andr{\'e} Barroso and Wolf-Dietrich Weber and Thomas F. Wenisch", title = "Power management of online data-intensive services", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "319--330", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000103", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Biswas:2011:FFF, author = "Susmit Biswas and Mohit Tiwari and Timothy Sherwood and Luke Theogarajan and Frederic T. 
Chong", title = "Fighting fire with fire: modeling the datacenter-scale effects of targeted superlattice thermal management", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "331--340", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000104", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Govindan:2011:BLT, author = "Sriram Govindan and Anand Sivasubramaniam and Bhuvan Urgaonkar", title = "Benefits and limitations of tapping into stored energy for datacenters", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "341--352", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000105", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Demme:2011:RIA, author = "John Demme and Simha Sethumadhavan", title = "Rapid identification of architectural bottlenecks via precise event counting", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "353--364", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000107", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Esmaeilzadeh:2011:DSE, author = "Hadi Esmaeilzadeh and Emily Blem and 
Renee {St. Amant} and Karthikeyan Sankaralingam and Doug Burger",
  title =        "Dark silicon and the end of multicore scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "365--376",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000108",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sun:2011:MME,
  author =       "Guangyu Sun and Christopher J. Hughes and Changkyu Kim and Jishen Zhao and Cong Xu and Yuan Xie and Yen-Kuang Chen",
  title =        "{Moguls}: a model to explore the memory hierarchy for bandwidth improvements",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "377--388",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000109",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mishra:2011:CHC,
  author =       "Asit K. Mishra and N. Vijaykrishnan and Chita R.
Das",
  title =        "A case for heterogeneous on-chip interconnects for {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "389--400",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000111",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Grot:2011:KNH,
  author =       "Boris Grot and Joel Hestness and Stephen W. Keckler and Onur Mutlu",
  title =        "{Kilo-NOC}: a heterogeneous network-on-chip architecture for scalability and service guarantees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "401--412",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000112",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ma:2011:DER,
  author =       "Sheng Ma and Natalie {Enright Jerger} and Zhiying Wang",
  title =        "{DBAR}: an efficient routing algorithm to support multiple concurrent applications in networks-on-chip",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "413--424",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000113",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Udipi:2011:CMC,
  author =       "Aniruddha N.
Udipi and Naveen Muralimanohar and Rajeev Balasubramonian and Al Davis and Norman P. Jouppi",
  title =        "Combining memory and a controller with photonics through {$3$D}-stacking to enable scalable and energy-efficient systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "425--436",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000115",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Binkert:2011:ROF,
  author =       "Nathan Binkert and Al Davis and Norman P. Jouppi and Moray McLaren and Naveen Muralimanohar and Robert Schreiber and Jung Ho Ahn",
  title =        "The role of optics in future high radix switch design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "437--448",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000116",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ma:2011:SPC,
  author =       "Kai Ma and Xue Li and Ming Chen and Xiaorui Wang",
  title =        "Scalable power control for many-core architectures running multi-threaded applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "449--460",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000117",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Alameldeen:2011:EEC,
  author =       "Alaa R. Alameldeen and Ilya Wagner and Zeshan Chishti and Wei Wu and Chris Wilkerson and Shih-Lien Lu",
  title =        "Energy-efficient cache design using variable-strength error-correcting codes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "461--472",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000118",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Barroso:2011:WSC,
  author =       "Luiz Andr{\'e} Barroso",
  title =        "Warehouse-Scale Computing: Entering the Teenage Decade",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "??--??",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2019527",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ferrucci:2011:IWD,
  author =       "David A.
Ferrucci",
  title =        "{IBM}'s {Watson\slash DeepQA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "??--??",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2019525",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kannan:2011:ARH,
  author =       "Ravi Kannan",
  title =        "Algorithms: Recent Highlights and Challenges",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "??--??",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2019526",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Leeser:2011:CWP,
  author =       "Miriam Leeser and Devon Yablonski and Dana Brooks and Laurie Smith King",
  title =        "The challenges of writing portable, correct and high performance libraries for {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "2--7",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082158",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Graphics Processing Units (GPUs) are widely used to accelerate scientific applications. Many successes have been reported with speedups of two or three orders of magnitude over serial implementations of the same algorithms.
These speedups typically pertain to a specific implementation with fixed parameters mapped to a specific hardware implementation. The implementations are not designed to be easily ported to other GPUs, even from the same manufacturer. When target hardware changes, the application must be re-optimized. In this paper we address a different problem. We aim to deliver working, efficient GPU code in a library that is downloaded and run by many different users.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tsoi:2011:PPO,
  author =       "Kuen Hung Tsoi and Wayne Luk",
  title =        "Power profiling and optimization for heterogeneous multi-core systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "8--13",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082159",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Processing speed and energy efficiency are two of the most critical issues for computer systems. This paper presents a systematic approach for profiling the power and performance characteristics of application targeting heterogeneous multi-core computing platforms. Our approach enables rapid and automated design space exploration involving optimisation of workload distribution for systems with accelerators such as FPGAs and GPUs.
We demonstrate that, with minor modification to the design, it is possible to estimate performance and power efficiency trade off to identify optimized workload distribution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Georgescu:2011:GAC,
  author =       "Serban Georgescu and Peter Chow",
  title =        "{GPU} accelerated {CAE} using open solvers and the cloud",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "14--19",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082161",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "After more than five years since GPUs were first used as accelerators for general scientific computations, the field of General Purpose GPU computing or GPGPU has finally reached mainstream. Developers have now access to a mature hardware and software ecosystem. On the software side, several major open-source packages now support GPU acceleration while on the hardware side cloud-based solutions provide a simple way to access powerful machines with the latest GPUs at low cost. In this context, we look at the GPU acceleration of CAE, with a focus on the matrix solvers. We compare the performance that can be achieved using the open-source solver package PETSc ran on GPU-enabled Amazon EC2 hardware with that of an optimized legacy FEM code ran on a last generation 12-core blade server.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:2011:DSE,
  author =       "Junying Chen and Billy Y. S. Yiu and Brandon K. Hamilton and Alfred C. H. Yu and Hayden K.-H.
So",
  title =        "Design space exploration of adaptive beamforming acceleration for bedside and portable medical ultrasound imaging",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "20--25",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082162",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The use of adaptive beamforming is a viable solution to provide high-resolution real-time medical ultrasound imaging. However, the increase in image resolution comes at an expense of a significant increase in compute requirement over conventional algorithms. In a bedside diagnosis setting where plug-in power is available, GPUs are promising accelerators to address the processing demand. However, in the case of point-of-care diagnostics where portable ultrasound imaging devices must be used, alternative power-efficient computer systems must be employed, possibly at the expense of lower image resolution in order to maintain real-time performance.
This paper presents an initial design space exploration on viable compute architectures that might address the drastically different requirements between bedside and portable medical ultrasound imaging systems using adaptive beamforming.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dohi:2011:GIO,
  author =       "Keisuke Dohi and Yuichiro Shibata and Kiyoshi Oguri and Takafumi Fujimoto",
  title =        "{GPU} implementation and optimization of electromagnetic simulation using the {FDTD} method for antenna designing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "26--31",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082163",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper describes electromagnetical field simulation using the 3D-FDTD method for antenna designing on a CUDA compatible GPU. We use the Split Perfectly Matched Layer as an absorbing boundary condition. As is well known, the 3D-FDTD method is a kind of stencil computation and is considered better at GPU implementation. In order to find the best blocking size for the target GPU architecture, we empirically explore a design space of blocking size.
We also propose a kernel fusing method as one of the efficient optimization methods, which improves the total performance about 10\% at the cost of a small increase in memory usage.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nagatsuka:2011:CER,
  author =       "Tomoyuki Nagatsuka and Yoshito Sakaguchi and Takayuki Matsumura and Kenji Kise",
  title =        "{CoreSymphony}: an efficient reconfigurable multi-core architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "32--37",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082165",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper describes CoreSymphony, a cooperative and reconfigurable superscalar processor architecture that improves single-thread performance in chip multiprocessor. CoreSymphony enables some narrow-issue cores to be fused into a single wide-issue core. In this paper, we describe the problems associated with achieving the cooperative superscalar processor. We then describe techniques by which to overcome these problems.
The evaluation results obtained using SPEC2006 benchmarks indicate that four-core fusion achieves 88\% higher IPC than an individual core.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Takamaeda-Yamazaki:2011:FBS,
  author =       "Shinya Takamaeda-Yamazaki and Ryosuke Sasakawa and Yoshito Sakaguchi and Kenji Kise",
  title =        "An {FPGA}-based scalable simulation accelerator for tile architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "38--43",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082166",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "FPGA-based simulation systems can simulate processor behavior in realistic time. In order to practically simulate tile many-core architectures, we propose ScalableCore for prototyping system development using multiple FPGAs. In this paper, we present an FPGA-based platform called ScalableCore system 1.1, which consists of several simulation tiles named ScalableCore Units. Each tile is connected to four neighbor tiles via interface boards called ScalableCore Boards, and so increasing the target number of cores is easy. We also describe useful techniques by which to achieve high scalability of simulation and to implement complicated hardware functions on an FPGA.
The developed system simulates the behavior of a tile architecture with DMA communications and NoC 14.2 times faster than a corresponding software-based functional simulator running on a standard computer with an Intel Core2Duo processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sano:2011:DSP,
  author =       "Kentaro Sano and Satoru Yamamoto and Yoshiaki Hatsuda",
  title =        "Domain-specific programmable design of scalable streaming-array for power-efficient stencil computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "44--49",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082168",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents the domain-specific programmable design of custom computing machines for high-performance stencil computation. Stencil computation is one of the typical kernels in scientific computations, however its low operational-intensity makes the sustained performance limited by memory bandwidth on recent microprocessors and GPUs. So far we have proposed a scalable streaming-array (SSA) of processing elements, which provides almost linear scalability by increasing FPGAs with a constant external memory bandwidth. In order to facilitate custom computing and efficiently utilize hardware resources for various and complex stencil-computations, we design programmable SSA with limited but necessary functionality.
We show the design concept, the programmable structure and the SIMD instruction set for SSA.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Akamine:2011:IOE,
  author =       "Takayuki Akamine and Kenta Inakagata and Yasunori Osana and Naoyuki Fujita and Hideharu Amano",
  title =        "An implementation of out-of-order execution system for acceleration of computational fluid dynamics on {FPGAs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "50--55",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082169",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "CFD is an important tool for designing aircraft components. FaSTAR is one of the most recent CFD program package with various solvers and automatic generation of grid data. However, FaSTAR is difficult to be executed in parallel machines because of its irregular data structure. Here, the surface integral module, one of cores of FaSTAR is implemented in an FPGA for future acceleration using a platform FLOPS-2D. However, even with hardware execution, the pipeline module suffers from frequent stalls caused by irregular and successive memory access.
In order to rid of the problem, a data controller for Out-Of-Order execution was designed and implemented.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:2011:EAH,
  author =       "Haisheng Liu and Smail Niar and Yassin El-Hillali and Atika Rivenq",
  title =        "Embedded architecture with hardware accelerator for target recognition in driver assistance system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "56--59",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082170",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a new Radar-based recognition system, which is able to identify obstacles during a vehicle movement. Obstacles recognition gives the benefits of avoiding false alarms and allows generating alarms that take into account the identification of the obstacle in front of the vehicle. In this paper, we first identify hotspots in the target recognition application. Then, we propose an optimized version of the multiple target recognition algorithm to respect the real time constraints of the application while simplifying the underlying hardware platform.
We also propose a flexible embedded architecture with hardware accelerator that supports the proposed algorithm.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pell:2011:SEF,
  author =       "Oliver Pell and Oskar Mencer",
  title =        "Surviving the end of frequency scaling with reconfigurable dataflow computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "60--65",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082172",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Over the past decade x86 processors have come to dominate the world's largest supercomputers. However in the future conventional multicore processors are unlikely to be able to deliver the necessary performance per \$ and per W to achieve exascale performance. Heterogeneous computing is emerging as a powerful alternative to conventional multi-core to help address these challenges.
In this paper we describe our approach to Maximum Performance Computing --- building application-specific computers which complement conventional x86 processors with high performance dataflow engines implemented on FPGA to provide 10--100$ \times $ improvements in performance and performance/W.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Balevic:2011:KAD,
  author =       "Ana Balevic and Bart Kienhuis",
  title =        "{KPN2GPU}: an approach for discovery and exploitation of fine-grain data parallelism in process networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "66--71",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082173",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With advances in manycore and accelerator architectures, the high performance and embedded spaces are rapidly converging. Emerging architectures feature different forms of parallelism. The Polyhedral Processes Networks (PPNs) are a proven model of choice for automated generation of pipeline and task parallel programs from sequential source code, however data parallelism is not addressed. In this paper, we present a systematic approach for identification and extraction of fine grain data parallelism from the PPN specification.
The approach is implemented in a tool, called kpn2gpu, which produces fine-grain data parallel CUDA kernels for graphics processing units (GPUs).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Akagic:2011:HSC,
  author =       "Amila Akagi{\'c} and Hideharu Amano",
  title =        "High speed {CRC} with 64-bit generator polynomial on an {FPGA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "72--77",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082175",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Deployment of jumbo frame sizes beyond 9000 bytes for storage systems is limited by 32-bit Cyclic Redundancy Checks used by a network protocol. In order to overcome this limitation we study possibility of using 64-bit polynomials in software and hardware, by using fastest multiple lookup tables algorithms for generating CRCs. CRC is a sequential process, thus the software based solutions are limited in throughput by speed and architectural improvements of a single CPU. We study tradeoff between using distributed LUTs and embedded BRAM in hardware implementations. Our results show that BRAM-based approach is the fastest hardware implementation, reaching maximum of 347.37 Gbps while processing 1024 bits at a time, which is 606x faster than the software implementation of the same algorithm running on Xeon 3.2 GHz with 2 MB of L2 cache.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yang:2011:BPR,
  author =       "Shufan Yang and T. M.
McGinnity",
  title =        "A biologically plausible real-time spiking neuron simulation environment based on a multiple-{FPGA} platform",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "78--81",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082176",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Neurological research has revealed that neurons encode information in the timing of spikes. Spiking neural network simulations are a flexible and powerful method for investigating the behaviour of such neuronal systems. The spiking neuron models which are used in simulations can be described mathematically, but the continuous time involved in mathematical models needs to be replaced by discrete time steps. An alternative approach, hardware implementation, provides the possibility of generating independent spikes precisely and simultaneously output spike waves in real biological time, under the premise that the spiking neural network implemented in hardware can take full advantage of hardware-timed speed and reliability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sawada:2011:PCW,
  author =       "Hiroomi Sawada and Morihiro Kuga and Motoki Amagasaki and Masahiro Iida and Toshinori Sueyoshi",
  title =        "Parallelization of the channel width search for {FPGA} routing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "82--85",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082177",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tanabe:2011:SFB, author = "Shoji Tanabe and Takuya Nagashima and Yoshiki Yamaguchi", title = "A study of an {FPGA} based flexible {SIMD} processor", journal = j-COMP-ARCH-NEWS, volume = "39", number = "4", pages = "86--89", month = sep, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2082156.2082179", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 20 17:53:58 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Trouve:2011:ADA, author = "Antoine Trouve and Kazuaki Murakami", title = "Augmenting {DR-ASIP} flexibility through multi-mode custom instructions", journal = j-COMP-ARCH-NEWS, volume = "39", number = "4", pages = "90--93", month = sep, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2082156.2082180", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 20 17:53:58 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper introduces a simple method called multimode custom instructions, which aims at reducing the power consumption of the register file of tightly coupled dynamically reconfigurable application specific instruction set processors (DR-ASIPs). To this end, it proposes to divide custom instructions into two sets depending on criteria related to their size, distribution and reuse rate. 
Performance is measured on a RISC DR-ASIP with a subset of MiBench using an original automatic custom instruction generator from assembly based on the dancing link algorithm.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kubota:2011:MWS,
  author =       "Shinya Kubota and Minoru Watanabe",
  title =        "A {MEMS} writer system embedded for a programmable optically reconfigurable gate array",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "94--97",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082181",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fousek:2011:AFC,
  author =       "Jan Fousek and Ji{\v{r}}{\'\i} Filipovi{\v{c}} and Mat{\'u}{\v{s}} Madzin",
  title =        "Automatic fusions of {CUDA--GPU} kernels for parallel map",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "98--99",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082183",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "When implementing a function mapping on the contemporary GPU, several contradictory performance factors affecting distribution of computation into GPU kernels have to be balanced. A decomposition-fusion scheme suggests to decompose the computational problem to be solved by several simple functions implemented as standalone kernels and to fuse some of these functions later into more complex kernels to improve memory locality.
In this paper, a prototype of source-to-source compiler automating
                 the fusion phase is presented and the impact of fusions
                 generated by the compiler as well as compiler
                 efficiency is experimentally evaluated.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Matsunobu:2011:DCE,
  author =       "Kohei Matsunobu and Keisuke Dohi and Yuichiro Shibata
                 and Kiyoshi Oguri",
  title =        "A discussion on calculating eigenvalues of real
                 symmetric tridiagonal matrices on a {GPU}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "100--101",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082184",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "While GPUs are attracting attention as an accelerator
                 in wide-ranged application areas, compatibility between
                 the architecture and selected algorithm is important to
                 effectively bring out their potential performance. This
                 paper focuses on eigenvalue calculation from a given
                 real symmetric tridiagonal matrix and compares GPU
                 implementations for the QR method and the bisection
                 method.
Implementation for a total of four different GPU architectures are
                 shown and compared to reveal the affinity between
                 algorithms and architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Meyer:2011:MRP,
  author =       "Dominik Meyer and Bernd Klauer",
  title =        "Multicore reconfiguration platform an alternative to
                 {RAMPSoC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "102--103",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082185",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The current state of the art in processor performance
                 improvement is multicore-processor systems. These
                 systems offer a number of homogeneous and static
                 processor cores for the parallel distribution of
                 computational tasks. A novel idea in this research
                 field is introduced by the Runtime Adaptive
                 Multi-Processor System-on-Chip (RAMPSoC) approach. It
                 uses a dynamic and partial reconfigurable system to
                 offer a heterogeneous multicore-processor system. It is
                 runtime adaptable to applications needs and provides a
                 high degree of freedom for system design and task
                 distribution.
The continuation of this idea is the Multicore Reconfiguration
                 Platform (MRP) presented in this paper.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bonamy:2011:PLI,
  author =       "Robin Bonamy and Daniel Chillet and Olivier Sentieys
                 and Sebastien Bilavarn",
  title =        "Parallelism Level Impact on Energy Consumption in
                 Reconfigurable Devices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "104--105",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082186",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Nowadays, System-on-Chip architectures are composed of
                 several execution resources which support complex
                 applications. As it shares silicon area and limits the
                 cost of the global circuit, the embedding of a
                 reconfigurable resource in these SoC provides
                 flexibility to the hardware. In this case, several
                 implementations of the same algorithm, offering
                 different characteristics, can be considered in order
                 to optimize performances.
In general, the tasks mapped on reconfigurable resources are
                 algorithms that can be defined through several levels
                 of parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agyeman:2011:PAO,
  author =       "Michael Opoku Agyeman and Ali Ahmadinia",
  title =        "Power and area optimisation in heterogeneous {$3$D}
                 networks-on-chip architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "106--107",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082187",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Three dimensional Network-on-Chip (3D NoC)
                 architectures have evolved with a lot of interest to
                 address the on-chip communication delays of modern SoC
                 systems. However, the vertical interconnections between
                 layers is more power and area hungry compared to 2D
                 interconnections. In this paper we propose area
                 efficient and low power heterogeneous NoC
                 architectures, which combines both the power and
                 performance benefits of 2D routers and 3D NoC-bus
                 hybrid router architectures in 3D mesh topologies.
Experimental results show a negligible penalty of up to 5\% in
                 average packet latency of 3D homogeneous NoC with bus
                 hybrid routers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2011:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "108--117",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082189",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Das:2011:HSR,
  author =       "Malay Das and Amitabha Sinha and Nishant Kumar Giri",
  title =        "High speed residue number system ({RNS}) based {FIR}
                 filter using distributed arithmetic ({DA})",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "1--4",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093341",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper high speed Residue Number System (RNS)
                 based FIR filter using Distributed Arithmetic (DA) is
                 proposed. The proposed architecture uses the module set
                 having the value of numbers as small as possible. In
                 case of using Distributed Arithmetic in FIR filter; the
                 size of LUTs gets increased exponentially with the
                 increase of tap of the filter. Here care has been taken
                 so that sizes of LUTs do not get increased. The
                 proposed architecture is designed using Verilog HDL; a
                 popular hardware description language [9]. The design
                 is synthesized with ISE 10.1 and implemented on
                 Xilinx's Virtex-4. The proposed architecture is also
                 compared with conventional RNS-DA FIR filter. The
                 results show that the proposed architecture can
                 implement FIR filter with high speed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chakraborty:2011:CBS,
  author =       "Anindita Chakraborty and Amitabha Sinha",
  title =        "Conversion of binary to single-term triple base
                 numbers for {DSP} applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "5--11",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093342",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Non-binary number systems are increasingly gaining
                 popularity in signal processing applications for their
                 capabilities of handling arithmetic operations
                 efficiently. One such number system, ``Double Base
                 Number System (DBNS)'' has gained attention to many
                 researchers for it's capability of performing
                 multiplication operation efficiently. Recently,
                 ``Triple Base Number System (TBNS)'' has been
                 introduced which shows better performance over DBNS for
                 higher bit operations in terms of speed, hardware
                 complexity and power dissipation. However, the
                 advantages of TBNS systems cannot be exploited due to
                 substantial overhead of conversion from binary to TBNS.
                 Keeping this issue in view, in this paper, a novel
                 architecture has been proposed for high performance
                 binary to TBNS conversion. Efficiency of this
                 conversion scheme has been dealt with in details and
                 experimental results and analysis clearly indicate the
                 novelty of the architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singha:2011:NAF,
  author =       "Satrughna Singha and Aniruddha Ghosh and Amitabha
                 Sinha",
  title =        "A new architecture for {FPGA} based implementation of
                 conversion of binary to double base number system
                 ({DBNS}) using parallel search technique",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "12--18",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093343",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Compute intensive signal Processing Algorithms demand
                 efficient execution of high performance arithmetic
                 operations. Since, double base number system (DBNS)
                 offers high performance arithmetic units, it is gaining
                 attention to many researchers. However, the advantage
                 of DBNS can not be exploited due to large conversion
                 time from binary to DBNS. Keeping this issue in view,
                 this paper presents a novel conversion scheme using
                 parallel search technique.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2011:INc,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "19--23",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093345",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lymberopoulos:2012:PIW,
  author =       "Dimitrios Lymberopoulos and Oriana Riva and Karin
                 Strauss and Akshay Mittal and Alexandros Ntoulas",
  title =        "{PocketWeb}: instant web browsing for mobile devices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "1--12",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150978",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The high network latencies and limited battery life of
                 mobile phones can make mobile web browsing a
                 frustrating experience. In prior work, we proposed
                 trading memory capacity for lower web access latency
                 and a more convenient data transfer schedule from an
                 energy perspective by prefetching slowly-changing data
                 (search queries and results) nightly, when the phone is
                 charging. However, most web content is intrinsically
                 much more dynamic and may be updated multiple times a
                 day, thus eliminating the effectiveness of periodic
                 updates. This paper addresses the challenge of
                 prefetching dynamic web content in a timely fashion,
                 giving the user an instant web browsing experience but
                 without aggravating the battery lifetime issue. We
                 start by analyzing the web access traces of 8,000
                 users, and observe that mobile web browsing exhibits a
                 strong spatiotemporal signature, which is different for
                 every user. We propose to use a machine learning
                 approach based on stochastic gradient boosting
                 techniques to efficiently model this signature on a per
                 user basis. The machine learning model is capable of
                 accurately predicting future web accesses and
                 prefetching the content in a timely manner.
Our experimental evaluation with 48,000 models trained on real user
                 datasets shows that we can accurately prefetch 60\% of
                 the URLs for about 80--90\% of the users within 2
                 minutes before the request. The system prototype we
                 built not only provides more than 80\% lower web access
                 time for more than 80\% of the users, but it also
                 achieves the same or lower radio energy dissipation by
                 more than 50\% for the majority of mobile users.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lin:2012:RUL,
  author =       "Felix Xiaozhu Lin and Zhen Wang and Robert LiKamWa and
                 Lin Zhong",
  title =        "{Reflex}: using low-power processors in smartphones
                 without knowing them",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "13--24",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150979",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "To accomplish frequent, simple tasks with high
                 efficiency, it is necessary to leverage low-power,
                 microcontroller-like processors that are increasingly
                 available on mobile systems. However, existing
                 solutions require developers to directly program the
                 low-power processors and carefully manage
                 inter-processor communication. We present Reflex, a
                 suite of compiler and runtime techniques that
                 significantly lower the barrier for developers to
                 leverage such low-power processors. The heart of Reflex
                 is a software Distributed Shared Memory (DSM) that
                 enables shared memory objects with release consistency
                 among code running on loosely coupled processors.
In order to achieve high energy efficiency without sacrificing
                 performance much, the Reflex DSM leverages (i) extreme
                 architectural asymmetry between low-power processors
                 and powerful central processors, (ii) aggressive
                 compile-time optimization, and (iii) a minimalist
                 runtime that supports efficient message passing and
                 event-driven execution. We report a complete
                 realization of Reflex that runs on a TI OMAP4430-based
                 development platform as well as on a custom
                 tri-processor mobile platform. Using smartphone sensing
                 applications reported in recent literature, we show
                 that Reflex supports a programming style very close to
                 contemporary smartphone programming. Compared to
                 message passing, the Reflex DSM greatly reduces efforts
                 in programming heterogeneous smartphones, eliminating
                 up to 38\% of the source lines of application code.
                 Compared to running the same applications on existing
                 smartphones, Reflex reduces the average system power
                 consumption by up to 81\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:2012:TGE,
  author =       "Jichuan Chang and Justin Meza and Parthasarathy
                 Ranganathan and Amip Shah and Rocky Shih and Cullen
                 Bash",
  title =        "Totally green: evaluating and designing servers for
                 lifecycle environmental impact",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "25--36",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150980",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The environmental impact of servers and datacenters is
                 an important future challenge. System architects have
                 traditionally focused on operational energy as a proxy
                 for designing green servers, but this ignores important
                 environmental implications from server production
                 (materials, manufacturing, etc.).
In contrast, this paper argues for a lifecycle focus on the
                 environmental impact of future server designs, to
                 include both operation and production. We present a new
                 methodology to quantify the total environmental impact
                 of system design decisions. Our approach uses the
                 thermodynamic metric of energy consumption, adapted and
                 validated for use by system architects. Using this
                 methodology, we evaluate the lifecycle impact of
                 several example system designs with
                 environment-friendly optimizations. Our results show
                 that environmental impact from production can be
                 important (around 20\% on current servers and growing)
                 and system design choices can reduce this component (by
                 30--40\%). Our results also highlight several,
                 sometimes unexpected, cross-interactions between the
                 environmental impact of production and operation that
                 further motivate a total lifecycle emphasis for future
                 green server designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ferdman:2012:CCS,
  author =       "Michael Ferdman and Almutaz Adileh and Onur Kocberber
                 and Stavros Volos and Mohammad Alisafaee and Djordje
                 Jevdjic and Cansu Kaynak and Adrian Daniel Popescu and
                 Anastasia Ailamaki and Babak Falsafi",
  title =        "Clearing the clouds: a study of emerging scale-out
                 workloads on modern hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "37--48",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150982",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Emerging scale-out workloads require extensive amounts
                 of computational resources. However, data centers using
                 modern server hardware face physical constraints in
                 space and power, limiting further expansion and calling
                 for improvements in the computational density per
                 server and in the per-operation energy.
Continuing to improve the computational resources of the cloud while
                 staying within physical constraints mandates optimizing
                 server efficiency to ensure that server hardware
                 closely matches the needs of scale-out workloads. In
                 this work, we introduce CloudSuite, a benchmark suite
                 of emerging scale-out workloads. We use performance
                 counters on modern servers to study scale-out
                 workloads, finding that today's predominant processor
                 micro-architecture is inefficient for running these
                 workloads. We find that inefficiency comes from the
                 mismatch between the workload needs and modern
                 processors, particularly in the organization of
                 instruction and data memory systems and the processor
                 core micro-architecture. Moreover, while today's
                 predominant micro-architecture is inefficient when
                 executing scale-out workloads, we find that continuing
                 the current trends will further exacerbate the
                 inefficiency in the future. In this work, we identify
                 the key micro-architectural needs of scale-out
                 workloads, calling for a change in the trajectory of
                 server processors that would lead to improved
                 computational density and power efficiency in data
                 centers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:2012:IOD,
  author =       "Yang Chen and Shuangde Fang and Lieven Eeckhout and
                 Olivier Temam and Chengyong Wu",
  title =        "Iterative optimization for the data center",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "49--60",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150983",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Iterative optimization is a simple but powerful
                 approach that searches for the best possible
                 combination of compiler optimizations for a given
                 workload.
However, each program, if not each data set, potentially favors a
                 different combination. As a result, iterative
                 optimization is plagued by several practical issues
                 that prevent it from being widely used in practice: a
                 large number of runs are required for finding the best
                 combination; the process can be data set dependent; and
                 the exploration process incurs significant overhead
                 that needs to be compensated for by performance
                 benefits. Therefore, while iterative optimization has
                 been shown to have significant performance potential,
                 it is seldomly used in production compilers. In this
                 paper, we propose Iterative Optimization for the Data
                 Center (IODC): we show that servers and data centers
                 offer a context in which all of the above hurdles can
                 be overcome. The basic idea is to spawn different
                 combinations across workers and recollect performance
                 statistics at the master, which then evolves to the
                 optimum combination of compiler optimizations. IODC
                 carefully manages costs and benefits, and is
                 transparent to the end user. We evaluate IODC using
                 both MapReduce and throughput compute-intensive server
                 applications. In order to reflect the large number of
                 users interacting with the system, we gather a very
                 large collection of data sets (at least 1000 and up to
                 several million unique data sets per program), for a
                 total storage of 10.7TB, and 568 days of CPU time. We
                 report an average performance improvement of 1.48$
                 \times $, and up to 2.08$ \times $, for the MapReduce
                 applications, and 1.14$ \times $, and up to 1.39$
                 \times $, for the throughput compute-intensive server
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ahmad:2012:TOM,
  author =       "Faraz Ahmad and Srimat T. Chakradhar and Anand
                 Raghunathan and T. N. Vijaykumar",
  title =        "{Tarazu}: optimizing {MapReduce} on heterogeneous
                 clusters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "61--74",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150984",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Data center-scale clusters are evolving towards
                 heterogeneous hardware for power, cost, differentiated
                 price-performance, and other reasons. MapReduce is a
                 well-known programming model to process large amount of
                 data on data center-scale clusters. Most MapReduce
                 implementations have been designed and optimized for
                 homogeneous clusters. Unfortunately, these
                 implementations perform poorly on heterogeneous
                 clusters (e.g., on a 90-node cluster that contains 10
                 Xeon-based servers and 80 Atom-based servers, Hadoop
                 performs worse than on 10-node Xeon-only or 80-node
                 Atom-only homogeneous sub-clusters for many of our
                 benchmarks). This poor performance remains despite
                 previously proposed optimizations related to management
                 of straggler tasks. In this paper, we address
                 MapReduce's poor performance on heterogeneous clusters.
                 Our first contribution is that the poor performance is
                 due to two key factors: (1) the non-intuitive effect
                 that MapReduce's built-in load balancing results in
                 excessive and bursty network communication during the
                 Map phase, and (2) the intuitive effect that the
                 heterogeneity amplifies load imbalance in the Reduce
                 computation. Our second contribution is Tarazu, a suite
                 of optimizations to improve MapReduce performance on
                 heterogeneous clusters. Tarazu consists of (1)
                 Communication-Aware Load Balancing of Map computation
                 (CALB) across the nodes, (2) Communication-Aware
                 Scheduling of Map computation (CAS) to avoid bursty
                 network traffic and (3) Predictive Load Balancing of
                 Reduce computation (PLB) across the nodes.
Using the above 90-node cluster, we show that Tarazu significantly
                 improves performance over a baseline of Hadoop with
                 straightforward tuning for hardware heterogeneity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Govindan:2012:LSE,
  author =       "Sriram Govindan and Di Wang and Anand Sivasubramaniam
                 and Bhuvan Urgaonkar",
  title =        "Leveraging stored energy for handling power
                 emergencies in aggressively provisioned datacenters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "75--86",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150985",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Datacenters spend \$10--25 per watt in provisioning
                 their power infrastructure, regardless of the watts
                 actually consumed. Since peak power needs arise rarely,
                 provisioning power infrastructure for them can be
                 expensive. One can, thus, aggressively under-provision
                 infrastructure assuming that simultaneous peak draw
                 across all equipment will happen rarely. The resulting
                 non-zero probability of emergency events where power
                 needs exceed provisioned capacity, however small,
                 mandates graceful reaction mechanisms to cap the power
                 draw instead of leaving it to disruptive circuit
                 breakers/fuses. Existing strategies for power capping
                 use temporal knobs local to a server that throttle the
                 rate of execution (using power modes), and/or spatial
                 knobs that redirect/migrate excess load to regions of
                 the datacenter with more power headroom. We show these
                 mechanisms to have performance degrading ramifications,
                 and propose an entirely orthogonal solution that
                 leverages existing UPS batteries to temporarily augment
                 the utility supply during emergencies.
We build an experimental prototype to demonstrate such power capping
                 on a cluster of 8 servers, each with an individual
                 battery, and implement several online heuristics in the
                 context of different datacenter workloads to evaluate
                 their effectiveness in handling power emergencies. We
                 show that: (i) our battery-based solution can handle
                 emergencies of short duration on its own, (ii)
                 supplement existing reaction mechanisms to enhance
                 their efficacy for longer emergencies, and (iii)
                 battery even provide feasible options when other knobs
                 do not suffice.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kadav:2012:UMD,
  author =       "Asim Kadav and Michael M. Swift",
  title =        "Understanding modern device drivers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "87--98",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150987",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Device drivers are the single largest contributor to
                 operating-system kernel code with over 5 million lines
                 of code in the Linux kernel, and cause significant
                 complexity, bugs and development costs. Recent years
                 have seen a flurry of research aimed at improving the
                 reliability and simplifying the development of drivers.
                 However, little is known about what constitutes this
                 huge body of code beyond the small set of drivers used
                 for research. In this paper, we study the source code
                 of Linux drivers to understand what drivers actually
                 do, how current research applies to them and what
                 opportunities exist for future research. We determine
                 whether assumptions made by most driver research, such
                 as that all drivers belong to a class, are indeed true.
We also analyze driver code and abstractions to determine whether
                 drivers can benefit from code re-organization or
                 hardware trends. We develop a set of static-analysis
                 tools to analyze driver code across various axes.
                 Broadly, our study looks at three aspects of driver
                 code (i) what are the characteristics of driver code
                 functionality and how applicable is driver research to
                 all drivers, (ii) how do drivers interact with the
                 kernel, devices, and buses, and (iii) are there
                 similarities that can be abstracted into libraries to
                 reduce driver size and complexity? We find that many
                 assumptions made by driver research do not apply to all
                 drivers. At least 44\% of drivers have code that is not
                 captured by a class definition, 28\% of drivers support
                 more than one device per driver, and 15\% of drivers do
                 significant computation over data. From the driver
                 interactions study, we find USB bus offers an efficient
                 bus interface with significant standardized code and
                 coarse-grained access, ideal for executing drivers in
                 isolation. We also find that drivers for different
                 buses and classes have widely varying levels of device
                 interaction, which indicates that the cost of isolation
                 will vary by class. Finally, from our driver similarity
                 study, we find 8\% of all driver code is substantially
                 similar to code elsewhere and may be removed with new
                 abstractions or libraries.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Panneerselvam:2012:COS,
  author =       "Sankaralingam Panneerselvam and Michael M. Swift",
  title =        "{Chameleon}: operating system support for dynamic
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "99--110",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150988",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The rise of multi-core processors has shifted
                 performance efforts towards parallel programs. However,
                 single-threaded code, whether from legacy programs or
                 ones difficult to parallelize, remains important.
                 Proposed asymmetric multicore processors statically
                 dedicate hardware to improve sequential performance,
                 but at the cost of reduced parallel performance.
                 However, several proposed mechanisms provide the
                 best-of-both-worlds by combining multiple cores into a
                 single, more powerful processor for sequential code.
                 For example, Core Fusion merges multiple cores to pool
                 caches and functional units, and Intel's Turbo Boost
                 raises the clock speed of a core if the other cores on
                 a chip are powered down. These reconfiguration
                 mechanisms have two important properties. First the set
                 of available cores and their capabilities can vary over
                 short time scales. Current operating systems are not
                 designed for rapidly changing hardware: the existing
                 hotplug mechanisms for reconfiguring processors require
                 global operations and hundreds of milliseconds to
                 complete. Second, configurations may be mutually
                 exclusive: using power to speed one core means it
                 cannot be used to speed another. Current schedulers
                 cannot manage this requirement. We present Chameleon,
                 an extension to Linux to support dynamic processors
                 that can reconfigure their cores at runtime. Chameleon
                 provides processor proxies to enable rapid
                 reconfiguration, execution objects to abstract the
                 processing capabilities of physical CPUs, and a cluster
                 scheduler to balance the needs of sequential and
                 parallel programs.
In experiments that emulate a dynamic processor, we find that
                 Chameleon can reconfigure processors 100,000 times
                 faster than Linux and allows applications full access
                 to hardware capabilities: sequential code runs at full
                 speed on a powerful execution context, while parallel
                 code runs on as many cores as possible.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hwang:2012:CRD,
  author =       "Andy A. Hwang and Ioan A. Stefanovici and Bianca
                 Schroeder",
  title =        "Cosmic rays don't strike twice: understanding the
                 nature of {DRAM} errors and the implications for system
                 design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "111--122",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150989",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Main memory is one of the leading hardware causes for
                 machine crashes in today's datacenters. Designing,
                 evaluating and modeling systems that are resilient
                 against memory errors requires a good understanding of
                 the underlying characteristics of errors in DRAM in the
                 field. While there have recently been a few first
                 studies on DRAM errors in production systems, these
                 have been too limited in either the size of the data
                 set or the granularity of the data to conclusively
                 answer many of the open questions on DRAM errors. Such
                 questions include, for example, the prevalence of soft
                 errors compared to hard errors, or the analysis of
                 typical patterns of hard errors. In this paper, we
                 study data on DRAM errors collected on a diverse range
                 of production systems in total covering nearly 300
                 terabyte-years of main memory. As a first contribution,
                 we provide a detailed analytical study of DRAM error
                 characteristics, including both hard and soft errors.
We find that a large fraction of DRAM errors in the field can be attributed to hard errors and we provide a detailed analytical study of their characteristics. As a second contribution, the paper uses the results from the measurement study to identify a number of promising directions for designing more resilient systems and evaluates the potential of different protection mechanisms in the light of realistic error patterns. One of our findings is that simple page retirement policies might be able to mask a large number of DRAM errors in production systems, while sacrificing only a negligible fraction of the total DRAM in the system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hari:2012:REA, author = "Siva Kumar Sastry Hari and Sarita V. Adve and Helia Naeimi and Pradeep Ramachandran", title = "{Relyzer}: exploiting application-level fault equivalence to analyze application resiliency to transient faults", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "123--134", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150990", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Future microprocessors need low-cost solutions for reliable operation in the presence of failure-prone devices. A promising approach is to detect hardware faults by deploying low-cost monitors of software-level symptoms of such faults. Recently, researchers have shown these mechanisms work well, but there remains a non-negligible risk that several faults may escape the symptom detectors and result in silent data corruptions (SDCs). 
Most prior evaluations of symptom-based detectors perform fault injection campaigns on application benchmarks, where each run simulates the impact of a fault injected at a hardware site at a certain point in the application's execution (application fault site). Since the total number of application fault sites is very large (trillions for standard benchmark suites), it is not feasible to study all possible faults. Previous work therefore typically studies a randomly selected sample of faults. Such studies do not provide any feedback on the portions of the application where faults were not injected. Some of those instructions may be vulnerable to SDCs, and identifying them could allow protecting them through other means if needed. This paper presents Relyzer, an approach that systematically analyzes all application fault sites and carefully picks a small subset to perform selective fault injections for transient faults. Relyzer employs novel fault pruning techniques that prune faults that need detailed study by either predicting their outcomes or showing them equivalent to other faults. We find that Relyzer prunes about 99.78\% of the total faults across twelve applications studied here, reducing the faults that require detailed simulation by 3 to 5 orders of magnitude for most of the applications. Fault injection simulations on the remaining faults can identify SDC causing faults in the entire application. Some of Relyzer's techniques rely on heuristics to determine fault equivalence. 
Our validation efforts show that Relyzer determines fault outcomes with 96\% accuracy, averaged across all the applications studied here.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Feiner:2012:CKI, author = "Peter Feiner and Angela Demke Brown and Ashvin Goel", title = "Comprehensive kernel instrumentation via dynamic binary translation", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "135--146", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150992", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Dynamic binary translation (DBT) is a powerful technique that enables fine-grained monitoring and manipulation of an existing program binary. At the user level, it has been employed extensively to develop various analysis, bug-finding, and security tools. Such tools are currently not available for operating system (OS) binaries since no comprehensive DBT framework exists for the OS kernel. To address this problem, we have developed a DBT framework that runs as a Linux kernel module, based on the user-level DynamoRIO framework. Our approach is unique in that it controls all kernel execution, including interrupt and exception handlers and device drivers, enabling comprehensive instrumentation of the OS without imposing any overhead on user-level code. In this paper, we discuss the key challenges in designing and building an in-kernel DBT framework and how the design differs from user-space. We use our framework to build several sample instrumentations, including simple instruction counting as well as an implementation of shadow memory for the kernel. Using the shadow memory, we build a kernel stack overflow protection tool and a memory addressability checking tool. 
Qualitatively, the system is fast enough and stable enough to run the normal desktop workload of one of the authors for several weeks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Odaira:2012:COA, author = "Rei Odaira and Toshio Nakatani", title = "Continuous object access profiling and optimizations to overcome the memory wall and bloat", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "147--158", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150993", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Future microprocessors will have more serious memory wall problems since they will include more cores and threads in each chip. Similarly, future applications will have more serious memory bloat problems since they are more often written using object-oriented languages and reusable frameworks. To overcome such problems, the language runtime environments must accurately and efficiently profile how programs access objects. We propose Barrier Profiler, a low-overhead object access profiler using a memory-protection-based approach called pointer barrierization and adaptive overhead reduction techniques. Unlike previous memory-protection-based techniques, pointer barrierization offers per-object protection by converting all of the pointers to a given object to corresponding barrier pointers that point to protected pages. Barrier Profiler achieves low overhead by not causing signals at object accesses that are unrelated to the needed profiles, based on profile feedback and a compiler analysis. 
Our experimental results showed Barrier Profiler provided sufficiently accurate profiles with 1.3\% on average and at most 3.4\% performance overhead for allocation-intensive benchmarks, while previous code-instrumentation-based techniques suffered from 9.2\% on average and at most 12.6\% overhead. The low overhead allows Barrier Profiler to be run continuously on production systems. Using Barrier Profiler, we implemented two new online optimizations to compress write-only character arrays and to adjust the initial sizes of mostly non-accessed arrays. They resulted in speed-ups of up to 8.6\% and 36\%, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Greathouse:2012:CUW, author = "Joseph L. Greathouse and Hongyi Xin and Yixin Luo and Todd Austin", title = "A case for unlimited watchpoints", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "159--172", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150994", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Numerous tools have been proposed to help developers fix software errors and inefficiencies. Widely-used techniques such as memory checking suffer from overheads that limit their use to pre-deployment testing, while more advanced systems have such severe performance impacts that they may require special-purpose hardware. Previous works have described hardware that can accelerate individual analyses, but such specialization stymies adoption; generalized mechanisms are more likely to be added to commercial processors. This paper demonstrates that the ability to set an unlimited number of fine-grain data watchpoints can reduce the runtime overheads of numerous dynamic software analysis techniques. 
We detail the watchpoint capabilities required to accelerate these analyses while remaining general enough to be useful in the future. We describe a hardware design that stores watchpoints in main memory and utilizes two different on-chip caches to accelerate performance. The first is a bitmap lookaside buffer that stores fine-grained watchpoints, while the second is a range cache that can efficiently hold large contiguous regions of watchpoints. As an example of the power of such a system, it is possible to use watchpoints to accelerate read/write set checks in a software data race detector by nearly 9$ \times $.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Olszewski:2012:AAS, author = "Marek Olszewski and Qin Zhao and David Koh and Jason Ansel and Saman Amarasinghe", title = "{Aikido}: accelerating shared data dynamic analyses", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "173--184", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150995", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Despite a burgeoning demand for parallel programs, the tools available to developers working on shared-memory multicore processors have lagged behind. One reason for this is the lack of hardware support for inspecting the complex behavior of these parallel programs. Inter-thread communication, which must be instrumented for many types of analyses, may occur with any memory operation. To detect such thread communication in software, many existing tools require the instrumentation of all memory operations, which leads to significant performance overheads. To reduce this overhead, some existing tools resort to random sampling of memory operations, which introduces false negatives. 
Unfortunately, neither of these approaches provide the speed and accuracy programmers have traditionally expected from their tools. In this work, we present Aikido, a new system and framework that enables the development of efficient and transparent analyses that operate on shared data. Aikido uses a hybrid of existing hardware features and dynamic binary rewriting to detect thread communication with low overhead. Aikido runs a custom hypervisor below the operating system, which exposes per-thread hardware protection mechanisms not available in any widely used operating system. This hybrid approach allows us to benefit from the low cost of detecting memory accesses with hardware, while maintaining the word-level accuracy of a software-only approach. To evaluate our framework, we have implemented an Aikido-enabled vector clock race detector. Our results show that the Aikido enabled race-detector outperforms existing techniques that provide similar accuracy by up to 6.0x, and 76\% on average, on the PARSEC benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kasikci:2012:DRV, author = "Baris Kasikci and Cristian Zamfir and George Candea", title = "Data races vs. data race bugs: telling the difference with {Portend}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "185--198", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150997", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Even though most data races are harmless, the harmful ones are at the heart of some of the worst concurrency bugs. Alas, spotting just the harmful data races in programs is like finding a needle in a haystack: 76\%--90\% of the true data races reported by state-of-the-art race detectors turn out to be harmless [45]. 
We present Portend, a tool that not only detects races but also automatically classifies them based on their potential consequences: Could they lead to crashes or hangs? Could their effects be visible outside the program? Are they harmless? Our proposed technique achieves high accuracy by efficiently analyzing multiple paths and multiple thread schedules in combination, and by performing symbolic comparison between program outputs. We ran Portend on 7 real-world applications: it detected 93 true data races and correctly classified 92 of them, with no human effort. 6 of them are harmful races. Portend's classification accuracy is up to 88\% higher than that of existing tools, and it produces easy-to-understand evidence of the consequences of harmful races, thus both proving their harmfulness and making debugging easier. We envision Portend being used for testing and debugging, as well as for automatically triaging bug reports.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Clements:2012:SAS, author = "Austin T. Clements and M. Frans Kaashoek and Nickolai Zeldovich", title = "Scalable address spaces using {RCU} balanced trees", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "199--210", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150998", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Software developers commonly exploit multicore processors by building multithreaded software in which all threads of an application share a single address space. This shared address space has a cost: kernel virtual memory operations such as handling soft page faults, growing the address space, mapping files, etc. can limit the scalability of these applications. 
In widely-used operating systems, all of these operations are synchronized by a single per-process lock. This paper contributes a new design for increasing the concurrency of kernel operations on a shared address space by exploiting read-copy-update (RCU) so that soft page faults can both run in parallel with operations that mutate the same address space and avoid contending with other page faults on shared cache lines. To enable such parallelism, this paper also introduces an RCU-based binary balanced tree for storing memory mappings. An experimental evaluation using three multithreaded applications shows performance improvements on 80 cores ranging from 1.7x to 3.4x for an implementation of this design in the Linux 2.6.37 kernel. The RCU-based binary tree enables soft page faults to run at a constant cost with an increasing number of cores, suggesting that the design will scale well beyond 80 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Volos:2012:ATM, author = "Haris Volos and Andres Jaan Tack and Michael M. Swift and Shan Lu", title = "Applying transactional memory to concurrency bugs", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "211--222", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2150999", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Multithreaded programs often suffer from synchronization bugs such as atomicity violations and deadlocks. These bugs arise from complicated locking strategies and ad hoc synchronization methods to avoid the use of locks. A survey of the bug databases of major open-source applications shows that concurrency bugs often take multiple fix attempts, and that fixes often introduce yet more concurrency bugs.
Transactional memory (TM) enables programmers to declare regions of code atomic without specifying a lock and has the potential to avoid these bugs. Where most previous studies have focused on using TM to write new programs from scratch, we consider its utility in fixing existing programs with concurrency bugs. We therefore investigate four methods of using TM on three concurrent programs. Overall, we find that 29\% of the bugs are not fixable by transactional memory, showing that TM does not address many important types of concurrency bugs. In particular, TM works poorly with extremely long critical sections and with deadlocks involving both condition variables and I/O. Conversely, we find that for 56\% of the bugs, transactional memory offers demonstrable value by simplifying the reasoning behind a fix or the effort to implement a fix, and using transactions in the first place would have avoided 71\% of the bugs examined. We also find that ad hoc synchronization put in place to avoid the overhead of locking can be greatly simplified with TM, but requires hardware support to perform well.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Joao:2012:BIS, author = "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu and Yale N. Patt", title = "Bottleneck identification and scheduling in multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "223--234", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151001", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Performance of multithreaded applications is limited by a variety of bottlenecks, e.g. critical sections, barriers and slow pipeline stages. These bottlenecks serialize execution, waste valuable execution cycles, and limit scalability of applications. 
This paper proposes Bottleneck Identification and Scheduling in Multithreaded Applications (BIS), a cooperative software-hardware mechanism to identify and accelerate the most critical bottlenecks. BIS identifies which bottlenecks are likely to reduce performance by measuring the number of cycles threads have to wait for each bottleneck, and accelerates those bottlenecks using one or more fast cores on an Asymmetric Chip Multi-Processor (ACMP). Unlike previous work that targets specific bottlenecks, BIS can identify and accelerate bottlenecks regardless of their type. We compare BIS to four previous approaches and show that it outperforms the best of them by 15\% on average. BIS' performance improvement increases as the number of cores and the number of fast cores in the system increase.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Radojkovic:2012:OTA, author = "Petar Radojkovi{\'c} and Vladimir {\v{C}}akarevi{\'c} and Miquel Moret{\'o} and Javier Verd{\'u} and Alex Pajuelo and Francisco J. Cazorla and Mario Nemirovsky and Mateo Valero", title = "Optimal task assignment in multithreaded processors: a statistical approach", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "235--248", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151002", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "The introduction of massively multithreaded (MMT) processors, comprised of a large number of cores with many shared resources, has made task scheduling, in particular task to hardware thread assignment, one of the most promising ways to improve system performance. However, finding an optimal task assignment for a workload running on MMT processors is an NP-complete problem.
Due to the fact that the performance of the best possible task assignment is unknown, the room for improvement of current task-assignment algorithms cannot be determined. This is a major problem for the industry because it could lead to: (1)~A waste of resources if excessive effort is devoted to improving a task assignment algorithm that already provides a performance that is close to the optimal one, or (2)~significant performance loss if insufficient effort is devoted to improving poorly-performing task assignment algorithms. In this paper, we present a method based on Extreme Value Theory that allows the prediction of the performance of the optimal task assignment in MMT processors. We further show that executing a sample of several hundred or several thousand random task assignments is enough to obtain, with very high confidence, an assignment with a performance that is close to the optimal one. We validate our method with an industrial case study for a set of multithreaded network applications running on an UltraSPARC~T2 processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jaleel:2012:CCR, author = "Aamer Jaleel and Hashem H. Najaf-abadi and Samantika Subramaniam and Simon C. Steely and Joel Emer", title = "{CRUISE}: cache replacement and utility-aware scheduling", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "249--260", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151003", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "When several applications are co-scheduled to run on a system with multiple shared LLCs, there is opportunity to improve system performance. This opportunity can be exploited by the hardware, software, or a combination of both hardware and software. 
The software, i.e., an operating system or hypervisor, can improve system performance by co-scheduling jobs on LLCs to minimize shared cache contention. The hardware can improve system throughput through better replacement policies by allocating more cache resources to applications that benefit from the cache and less to those applications that do not. This study presents a detailed analysis on the interactions between intelligent scheduling and smart cache replacement policies. We find that smart cache replacement reduces the burden on software to provide intelligent scheduling decisions. However, under smart cache replacement, there is still room to improve performance from better application co-scheduling. We find that co-scheduling decisions are a function of the underlying LLC replacement policy. We propose Cache Replacement and Utility-aware Scheduling (CRUISE)---a hardware/software co-designed approach for shared cache management. For 4-core and 8-core CMPs, we find that CRUISE approaches the performance of an ideal job co-scheduling policy under different LLC replacement policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DeVuyst:2012:EMH, author = "Matthew DeVuyst and Ashish Venkat and Dean M. Tullsen", title = "Execution migration in a heterogeneous-{ISA} chip multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "261--272", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151004", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Prior research has shown that single-ISA heterogeneous chip multiprocessors have the potential for greater performance and energy efficiency than homogeneous CMPs. However, restricting the cores to a single ISA removes an important opportunity for greater heterogeneity.
To take full advantage of a heterogeneous-ISA CMP, however, we must be able to migrate execution among heterogeneous cores in order to adapt to program phase changes and changing external conditions (e.g., system power state). This paper explores migration on heterogeneous-ISA CMPs. This is non-trivial because program state is kept in an architecture-specific form; therefore, state transformation is necessary for migration. To keep migration cost low, the amount of state that requires transformation must be minimized. This work identifies large portions of program state whose form is not critical for performance; the compiler is modified to produce programs that keep most of their state in an architecture-neutral form so that only a small number of data items must be repositioned and no pointers need to be changed. The result is low migration cost with minimal sacrifice of non-migration performance. Additionally, this work leverages binary translation to enable instantaneous migration. When migration is requested, the program is immediately migrated to a different core where binary translation runs for a short time until a function call is reached, at which point program state is transformed and execution continues natively on the new core. 
This system can tolerate migrations as often as every 100 ms and still retain 95\% of the performance of a system that does not do, or support, migration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lin:2012:ESC, author = "Changhui Lin and Vijay Nagarajan and Rajiv Gupta and Bharghava Rajaram", title = "Efficient sequential consistency via conflict ordering", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "273--286", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151006", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Although the sequential consistency (SC) model is the most intuitive, processor designers often choose to support relaxed memory consistency models for higher performance. This is because SC implementations that match the performance of relaxed memory models require post-retirement speculation and its associated hardware costs. In this paper we propose an efficient approach for enforcing SC without requiring post-retirement speculation. While prior SC implementations guarantee SC by explicitly completing memory operations within a processor in program order, we guarantee SC by completing conflicting memory operations, within and across processors, in an order that is consistent with the program order. More specifically, we identify those conflicting memory operations whose ordering is critical for the maintenance of SC and explicitly order them. This allows us to safely (non-speculatively) complete memory operations past pending writes, thus reducing memory ordering stalls. 
Our experiments with SPLASH-2 programs show that SC can be achieved efficiently, with performance comparable to RMO (relaxed memory order).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cheriton:2012:HAS, author = "David Cheriton and Amin Firoozshahian and Alex Solomatnikov and John P. Stevenson and Omid Azizi", title = "{HICAMP}: architectural support for efficient concurrency-safe shared structured data access", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "287--300", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151007", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Programming language and operating system support for efficient concurrency-safe access to shared data is a key concern for the effective use of multi-core processors. Most research has focused on the software model of multiple threads accessing this data within a single shared address space. However, many real applications are actually structured as multiple separate processes for fault isolation and simplified synchronization. In this paper, we describe the HICAMP architecture and its innovative memory system, which supports efficient concurrency safe access to structured shared data without incurring the overhead of inter-process communication. The HICAMP architecture also provides support for programming language and OS structures such as threads, iterators, read-only access and atomic update. 
In addition to demonstrating that HICAMP is beneficial for multi-process structured applications, our evaluation shows that the same mechanisms provide substantial benefits for other areas, including sparse matrix computations and virtualization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Esmaeilzadeh:2012:ASD, author = "Hadi Esmaeilzadeh and Adrian Sampson and Luis Ceze and Doug Burger", title = "Architecture support for disciplined approximate programming", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "301--312", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151008", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Disciplined approximate programming lets programmers declare which parts of a program can be computed approximately and consequently at a lower energy cost. The compiler proves statically that all approximate computation is properly isolated from precise computation. The hardware is then free to selectively apply approximate storage and approximate computation with no need to perform dynamic correctness checks. In this paper, we propose an efficient mapping of disciplined approximate programming onto hardware. We describe an ISA extension that provides approximate operations and storage, which give the hardware freedom to save energy at the cost of accuracy. We then propose Truffle, a microarchitecture design that efficiently supports the ISA extensions. The basis of our design is dual-voltage operation, with a high voltage for precise operations and a low voltage for approximate operations. The key aspect of the microarchitecture is its dependence on the instruction stream to determine when to use the low voltage. 
We evaluate the power savings potential of in-order and out-of-order Truffle configurations and explore the resulting quality of service degradation. We evaluate several applications and demonstrate energy savings up to 43\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Meisner:2012:DAS, author = "David Meisner and Thomas F. Wenisch", title = "{DreamWeaver}: architectural support for deep sleep", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "313--324", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151009", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Numerous data center services exhibit low average utilization leading to poor energy efficiency. Although CPU voltage and frequency scaling historically has been an effective means to scale down power with utilization, transistor scaling trends are limiting its effectiveness and the CPU is accounting for a shrinking fraction of system power. Recent research advocates the use of full-system idle low-power modes to combat energy losses, as such modes provide the deepest power savings with bounded response time impact. However, the trend towards increasing cores per die is undermining the effectiveness of these sleep modes, particularly for request-parallel data center applications, because the independent idle periods across individual cores are unlikely to align by happenstance. We propose DreamWeaver, architectural support to facilitate deep sleep for request-parallel applications on multicore servers. 
DreamWeaver comprises two elements: Weave Scheduling, a scheduling policy to coalesce idle and busy periods across cores to create opportunities for system-wide deep sleep; and the Dream Processor, a light-weight co-processor that monitors incoming network traffic and suspended work during sleep to determine when the system must wake. DreamWeaver is based on two key concepts: (1) stall execution and sleep anytime any core is unoccupied, but (2) constrain the maximum time any request may be stalled. Unlike prior scheduling approaches, DreamWeaver will preempt execution to sleep, maximizing time spent at the systems' most efficient operating point. We demonstrate that DreamWeaver can smoothly trade-off bounded, predictable increases in 99th-percentile response time for increasing power savings, and strictly dominates the savings available with voltage and frequency scaling and timeout-based request batching schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{King:2012:AGH, author = "Myron King and Nirav Dave and Arvind", title = "Automatic generation of hardware\slash software interfaces", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "325--336", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151011", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Enabling new applications for mobile devices often requires the use of specialized hardware to reduce power consumption. Because of time-to-market pressure, current design methodologies for embedded applications require an early partitioning of the design, allowing the hardware and software to be developed simultaneously, each adhering to a rigid interface contract. 
This approach is problematic for two reasons: (1) a detailed hardware-software interface is difficult to specify until one is deep into the design process, and (2) it prevents the later migration of functionality across the interface motivated by efficiency concerns or the addition of features. We address this problem using the Bluespec Codesign Language~(BCL) which permits the designer to specify the hardware-software partition in the source code, allowing the compiler to synthesize efficient software and hardware along with transactors for communication between the partitions. The movement of functionality across the hardware-software boundary is accomplished by simply specifying a new partitioning, and since the compiler automatically generates the desired interface specifications, it eliminates yet another error-prone design task. In this paper we present BCL, an extension of a commercially available hardware design language (Bluespec SystemVerilog), a new software compiling scheme, and preliminary results generated using our compiler for various hardware-software decompositions of an Ogg Vorbis audio decoder, and a ray-tracing application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Martignoni:2012:PEL, author = "Lorenzo Martignoni and Stephen McCamant and Pongsin Poosankam and Dawn Song and Petros Maniatis", title = "Path-exploration lifting: hi-fi tests for lo-fi emulators", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "337--348", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151012", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Processor emulators are widely used to provide isolation and instrumentation of binary software. 
However they have proved difficult to implement correctly: processor specifications have many corner cases that are not exercised by common workloads. It is untenable to base other system security properties on the correctness of emulators that have received only ad-hoc testing. To obtain emulators that are worthy of the required trust, we propose a technique to explore a high-fidelity emulator with symbolic execution, and then lift those test cases to test a lower-fidelity emulator. The high-fidelity emulator serves as a proxy for the hardware specification, but we can also further validate by running the tests on real hardware. We implement our approach and apply it to generate about 610,000 test cases; for about 95\% of the instructions we achieve complete path coverage. The tests reveal thousands of individual differences; we analyze those differences to shed light on a number of root causes, such as atomicity violations and missing security features.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hong:2012:GMD, author = "Sungpack Hong and Hassan Chafi and Eric Sedlar and Kunle Olukotun", title = "{Green-Marl}: a {DSL} for easy and efficient graph analysis", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "349--362", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151013", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "The increasing importance of graph-data based applications is fueling the need for highly efficient and parallel implementations of graph analysis software. In this paper we describe Green-Marl, a domain-specific language (DSL) whose high level language constructs allow developers to describe their graph analysis algorithms intuitively, but expose the data-level parallelism inherent in the algorithms.
We also present our Green-Marl compiler which translates high-level algorithmic description written in Green-Marl into an efficient C++ implementation by exploiting this exposed data-level parallelism. Furthermore, our Green-Marl compiler applies a set of optimizations that take advantage of the high-level semantic knowledge encoded in the Green-Marl DSL. We demonstrate that graph analysis algorithms can be written very intuitively with Green-Marl through some examples, and our experimental results show that the compiler-generated implementation out of such descriptions performs as well as or better than highly-tuned hand-coded implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Park:2012:SDE, author = "Yongjun Park and Sangwon Seo and Hyunchul Park and Hyoun Kyu Cho and Scott Mahlke", title = "{SIMD} defragmenter: efficient {ILP} realization on data-parallel architectures", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "363--374", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151014", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Single-instruction multiple-data (SIMD) accelerators provide an energy-efficient platform to scale the performance of mobile systems while still retaining post-programmability. The central challenge is translating the parallel resources of the SIMD hardware into real application performance. In scientific applications, automatic vectorization techniques have proven quite effective at extracting large levels of data-level parallelism (DLP). However, vectorization is often much less effective for media applications due to low trip count loops, complex control flow, and non-uniform execution behavior. As a result, SIMD lanes remain idle due to insufficient DLP. 
To attack this problem, this paper proposes a new vectorization pass called SIMD Defragmenter to uncover hidden DLP that lurks below the surface in the form of instruction-level parallelism (ILP). The difficulty is managing the data packing/unpacking overhead that can easily exceed the benefits gained through SIMD execution. The SIMD defragmenter overcomes this problem by identifying groups of compatible instructions (subgraphs) that can be executed in parallel across the SIMD lanes. By SIMDizing in bulk at the subgraph level, packing/unpacking overhead is minimized. On a 16-lane SIMD processor, experimental results show that SIMD defragmentation achieves a mean 1.6x speedup over traditional loop vectorization and a 31\% gain over prior research approaches for converting ILP to DLP.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Simha:2012:UAS, author = "Dilip Nijagal Simha and Maohua Lu and Tzi-cker Chiueh", title = "An update-aware storage system for low-locality update-intensive workloads", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "375--386", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151016", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Traditional storage systems provide a simple read/write interface, which is inadequate for low-locality update-intensive workloads because it limits the disk scheduling flexibility and results in inefficient use of buffer memory and raw disk bandwidth. This paper describes an update-aware disk access interface that allows applications to explicitly specify disk update requests and associate with such requests call-back functions that will be invoked when the requested disk blocks are brought into memory.
Because call-back functions offer a continuation mechanism after retrieval of requested blocks, storage systems supporting this interface are given more flexibility in scheduling pending disk update requests. In particular, this interface enables a simple but effective technique called Batching mOdifications with Sequential Commit (BOSC), which greatly improves the sustained throughput of a storage system under low-locality update-intensive workloads. In addition, together with a space-efficient low-latency disk logging technique, BOSC is able to deliver the same durability guarantee as synchronous disk updates. Empirical measurements show that the random update throughput of a BOSC-based B+ tree is more than an order of magnitude higher than that of the same B+ tree implementation on a traditional storage system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Caulfield:2012:PSU, author = "Adrian M. Caulfield and Todor I. Mollov and Louis Alex Eisner and Arup De and Joel Coburn and Steven Swanson", title = "Providing safe, user space access to fast, solid state disks", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "387--400", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151017", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Emerging fast, non-volatile memories (e.g., phase change memories, spin-torque MRAMs, and the memristor) reduce storage access latencies by an order of magnitude compared to state-of-the-art flash-based SSDs. This improved performance means that software overheads that had little impact on the performance of flash-based systems can present serious bottlenecks in systems that incorporate these new technologies. 
We describe a novel storage hardware and software architecture that nearly eliminates two sources of this overhead: Entering the kernel and performing file system permission checks. The new architecture provides a private, virtualized interface for each process and moves file system protection checks into hardware. As a result, applications can access file data without operating system intervention, eliminating OS and file system costs entirely for most accesses. We describe the support the system provides for fast permission checks in hardware, our approach to notifying applications when requests complete, and the small, easily portable changes required in the file system to support the new access model. Existing applications require no modification to use the new interface. We evaluate the performance of the system using a suite of microbenchmarks and database workloads and show that the new interface improves latency and bandwidth for 4 KB writes by 60\% and 7.2x, respectively, OLTP database transaction throughput by up to 2.0x, and Berkeley-DB throughput by up to 5.7x. A streamlined asynchronous file IO interface built to fully utilize the new interface enables an additional 5.5x increase in throughput with 1 thread and 2.8x increase in efficiency for 512 B transfers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Narayanan:2012:WSP, author = "Dushyanth Narayanan and Orion Hodson", title = "Whole-system persistence", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "401--410", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151018", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Today's databases and key-value stores commonly keep all their data in main memory. 
A single server can have over 100 GB of memory, and a cluster of such servers can have 10s to 100s of TB. However, a storage back end is still required for recovery from failures. Recovery can last for minutes for a single server or hours for a whole cluster, causing heavy load on the back end. Non-volatile main memory (NVRAM) technologies can help by allowing near-instantaneous recovery of in-memory state. However, today's software does not support this well. Block-based approaches such as persistent buffer caches suffer from data duplication and block transfer overheads. Recently, user-level persistent heaps have been shown to have much better performance than these. However they require substantial application modification and still have significant runtime overheads. This paper proposes whole-system persistence (WSP) as an alternative. WSP is aimed at systems where all memory is non-volatile. It transparently recovers an application's entire state, making a failure appear as a suspend/resume event. Runtime overheads are eliminated by using ``flush on fail'': transient state in processor registers and caches is flushed to NVRAM only on failure, using the residual energy from the system power supply. 
Our evaluation shows that this approach has 1.6--13 times better runtime performance than a persistent heap, and that flush-on-fail can complete safely within 2--35\% of the residual energy window provided by standard power supplies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gordon:2012:EBM, author = "Abel Gordon and Nadav Amit and Nadav Har'El and Muli Ben-Yehuda and Alex Landau and Assaf Schuster and Dan Tsafrir", title = "{ELI}: bare-metal performance for {I/O} virtualization", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "411--422", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151020", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Direct device assignment enhances the performance of guest virtual machines by allowing them to communicate with I/O devices without host involvement. But even with device assignment, guests are still unable to approach bare-metal performance, because the host intercepts all interrupts, including those interrupts generated by assigned devices to signal to guests the completion of their I/O requests. The host involvement induces multiple unwarranted guest/host context switches, which significantly hamper the performance of I/O intensive workloads. To solve this problem, we present ELI (ExitLess Interrupts), a software-only approach for handling interrupts within guest virtual machines directly and securely.
By removing the host from the interrupt handling path, ELI manages to improve the throughput and latency of unmodified, untrusted guests by 1.3x--1.6x, allowing them to reach 97\%--100\% of bare-metal performance even for the most demanding I/O-intensive workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vasic:2012:DAR, author = "Nedeljko Vasi{\'c} and Dejan Novakovi{\'c} and Svetozar Miucin and Dejan Kosti{\'c} and Ricardo Bianchini", title = "{DejaVu}: accelerating resource allocation in virtualized environments", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "423--436", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151021", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Effective resource management of virtualized environments is a challenging task. State-of-the-art management systems either rely on analytical models or evaluate resource allocations by running actual experiments. However, both approaches incur a significant overhead once the workload changes. The former needs to re-calibrate and re-validate models, whereas the latter has to run a new set of experiments to select a new resource allocation. During the adaptation period, the system may run with an inefficient configuration. In this paper, we propose DejaVu --- a framework that (1) minimizes the resource management overhead by identifying a small set of workload classes for which it needs to evaluate resource allocation decisions, (2) quickly adapts to workload changes by classifying workloads using signatures and caching their preferred resource allocations at runtime, and (3) deals with interference by estimating an ``interference index''. We evaluate DejaVu by running representative network services on Amazon EC2.
DejaVu achieves more than 10x speedup in adaptation time for each workload change relative to the state-of-the-art. By enabling quick adaptation, DejaVu saves up to 60\% of the service provisioning cost. Finally, DejaVu is easily deployable as it does not require any extensive instrumentation or human intervention.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Szefer:2012:ASH, author = "Jakub Szefer and Ruby B. Lee", title = "Architectural support for hypervisor-secure virtualization", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "437--450", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151022", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Virtualization has become a standard part of many computer systems. A key part of virtualization is the all-powerful hypervisor which manages the physical platform and can access all of its resources, including memory assigned to the guest virtual machines (VMs). Continuing releases of bug reports and exploits in the virtualization software show that defending the hypervisor against attacks is very difficult. In this work, we present hypervisor-secure virtualization --- a new research direction with the goal of protecting the guest VMs from an untrusted hypervisor. We also present the HyperWall architecture which achieves hypervisor-secure virtualization, using hardware to provide the protections. HyperWall allows a hypervisor to freely manage the memory, processor cores and other resources of a platform. Yet once VMs are created, our new Confidentiality and Integrity Protection (CIP) tables protect the memory of the guest VMs from accesses by the hypervisor or by DMA, depending on the customer's specification. If a hypervisor does become compromised, e.g. 
by an attack from a malicious VM, it cannot be used in turn to attack other VMs. The protections are enabled through minimal modifications to the microprocessor and memory management units. Whereas much of the previous work concentrates on protecting the hypervisor from attacks by guest VMs, we tackle the problem of protecting the guest VMs from the hypervisor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:2012:RSE, author = "Min Lee and Karsten Schwan", title = "Region scheduling: efficiently using the cache architectures via page-level affinity", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "451--462", month = mar, year = "2012", DOI = "https://doi.org/10.1145/2189750.2151023", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "The performance of modern many-core platforms strongly depends on the effectiveness of using their complex cache and memory structures. This indicates the need for a memory-centric approach to platform scheduling, in which it is the locations of memory blocks in caches rather than CPU idleness that determines where application processes are run. Using the term `memory region' to denote the current set of physical memory pages actively used by an application, this paper presents and evaluates region-based scheduling methods for multicore platforms. This involves (i) continuously and at runtime identifying the memory regions used by executable entities, and their sizes, (ii) mapping these regions to caches to match performance goals, and (iii) maintaining region to cache mappings by ensuring that entities run on processors with direct access to the caches containing their regions. 
Region scheduling can implement policies that (i) offer improved performance to applications by `unifying' the multiple caches present on the underlying physical machine and/or by `balancing' cache usage to take maximum advantage of available cache space, (ii) better isolate applications from each other, particularly when their performance is strongly affected by cache availability, and also (iii) take advantage of standard scheduling and CPU-based load balancing when regioning is ineffective. The paper describes region scheduling and its system-level implementation and evaluates its performance with micro-benchmarks and representative multi-core applications. Single applications see performance improvements of up to 15\% with region scheduling, and we observe 40\% latency improvements when a platform is shared by multiple applications. Superior isolation is shown to be particularly important for cache-sensitive or real-time codes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Juurlink:2012:ALP, author = "B. H. H. Juurlink and C. H. Meenderinck", title = "{Amdahl}'s law for predicting the future of multicores considered harmful", journal = j-COMP-ARCH-NEWS, volume = "40", number = "2", pages = "1--9", month = may, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2234336.2234338", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Several recent works predict the future of multicore systems or identify scalability bottlenecks based on Amdahl's law. Amdahl's law implicitly assumes, however, that the problem size stays constant, but in most cases more cores are used to solve larger and more complex problems. There is a related law known as Gustafson's law which assumes that runtime, not the problem size, is constant. 
In other words, it is assumed that the runtime on p cores is the same as the runtime on 1 core and that the parallel part of an application scales linearly with the number of cores. We apply Gustafson's law to symmetric, asymmetric, and dynamic multicores and show that this leads to fundamentally different results than when Amdahl's law is applied. We also generalize Amdahl's and Gustafson's law and study how this quantitatively affects the dimensioning of future multicore systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mueller:2012:ABA, author = "Conrad Mueller", title = "Axiom based architecture", journal = j-COMP-ARCH-NEWS, volume = "40", number = "2", pages = "10--17", month = may, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2234336.2234339", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The paper proposes an axiom based architecture as an alternative to the von Neumann model. The model has many desirable properties: fine-grained parallelism, simple semantics, better security and ease of programming. The empirical research gives some indication of its performance potential. A description is given as to how algebraic arithmetic expressions of relations can be broken up into primitive expressions consisting of a single operation. These primitive relations are shown to be sufficient to describe a Turing machine. Eight inference rules are given that define how the primitive relations can be evaluated. An outline is given of an architecture based on these inference rules. Finally a brief description is given of an experimental emulation and empirical evaluation of the architecture.
Instead of manipulating data or values by applying instructions or functions, computation is applying existing elements to relations to create new elements. The element's identifier determines which relations the element applies to. The relation determines the identifier of the new element and the operation that needs to be applied to create the value of the new element. Conceptually, indices are different in this model. Instead of seeing an index as an offset into an array, an index is seen as part of the element identifier. This enables infinitely many relations to be defined between unique sets using universal quantifiers. Thus every element, or value, computed has a unique description.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thomasian:2012:RPR, author = "Alexander Thomasian", title = "Rebuild processing in {RAID5} with emphasis on the supplementary parity augmentation method", journal = j-COMP-ARCH-NEWS, volume = "40", number = "2", pages = "18--27", month = may, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2234336.2234340", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The rotated parity RAID5 disk array tolerates single disk failures by continuing operation by on-demand reconstruction of data blocks of the failed disk, until the systematic reconstruction of the contents of the failed disk is completed by the rebuild process on a spare disk. Supplementary Parity Augmentation (SPA), unlike the pyramid code, which has two parities covering half of the array's disks each, extends RAID5's P parity with an additional S parity, which covers half of the disks.
The extra load with respect to RAID5 of updating the S parity by one half of the disks is compensated by the more efficient on demand reconstruction and rebuild processing when a disk fails. Although SPA has the same disk space redundancy level as RAID6, unlike RAID6 it can only deal with roughly half of all possible double disk failure cases for eight disks. For rebuild processing SPA reads half of the disks required by RAID5 and this leads to a higher Mean Time to Data Loss than RAID5, since fewer Latent Sector Errors are encountered. We review performance and reliability modeling of RAID5 arrays to provide insights into SPA's performance and reliability, which cannot be gained from numerical results alone. SPA is outperformed by the Intra-Disk Redundancy schemes combined with RAID5, which results in RAID6's reliability and RAID5 performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Giri:2012:FIN, author = "Nishant Kumar Giri and Amitabha Sinha", title = "{FPGA} implementation of a novel architecture for performance enhancement of Radix-2 {FFT}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "2", pages = "28--32", month = may, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2234336.2234341", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a novel architecture for the enhancement of performance of compute intensive Fast Fourier Transform (FFT) algorithm which is common in many signal processing applications. The proposed architecture exhibits faster response time compared to radix-2 `Single-path Delay Feedback (SDF)' architecture and `radix-2 Multi-path Delay Commutator (MDC)' architecture. 
The architecture was simulated using Modelsim and was implemented on Xilinx Virtex 4 FPGA.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ghosh:2012:NAF, author = "Aniruddha Ghosh and Satrughna Singha and Amitabha Sinha", title = "A new architecture for {FPGA} implementation of a {MAC} unit for digital signal processors using mixed number system", journal = j-COMP-ARCH-NEWS, volume = "40", number = "2", pages = "33--38", month = may, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2234336.2234342", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Execution of arithmetic operations at very high speed in real time is the major concern in digital signal processing (DSP) because DSP algorithms are computation intensive. In recent times, Residue Number Systems (RNS) are considered as alternative to binary number system because of their capabilities of performing ``carry-free'' addition and Multiplication. Double Base Number Systems (DBNS), another non-binary number systems are also increasingly becoming attractive for signal processing applications due to their capabilities of handling arithmetic operations, particularly multiplication efficiently. However, the complexity involved in converting binary to DBNS becomes a major bottleneck and the efficiency of performance decreases considerably due to large conversion time. So RNS Adder and DBNS Multiplier can be used to implement multiply \& accumulate (MAC) units. Because RNS adders are less complex and faster compared to DBNS and DBNS multipliers are efficient compared to RNS multiplier. MAC units are the key units in Digital Signal Processors. 
In this paper we have shown how FIR filter can be implemented using the proposed ``Mixed Number System MAC units''.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ghosh:2012:FPR, author = "Aniruddha Ghosh and Satrughna Singha and Amitabha Sinha", title = "{``Floating point RNS''}: a new concept for designing the {MAC} unit of digital signal processor", journal = j-COMP-ARCH-NEWS, volume = "40", number = "2", pages = "39--43", month = may, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2234336.2234343", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Execution of arithmetic operations at a very high speed in real time is the major concern in compute intensive digital signal processing (DSP) algorithms. Residue Number Systems are being considered as alternative to binary number system because of their capabilities of performing ``carry free'' arithmetic operations. However, RNS systems have so far been used to handle integer numbers only. Floating Point RNS arithmetic units have obvious advantages over fixed point multiply \& accumulate (MAC) units which are the key units in Digital Signal Processors.
Keeping this in view, in this paper, the architecture of a floating point MAC unit is presented.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2012:INa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "40", number = "2", pages = "44--49", month = may, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2234336.2234345", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Liu:2012:RRA, author = "Jamie Liu and Ben Jaiyen and Richard Veras and Onur Mutlu", title = "{RAIDR}: {Retention-Aware Intelligent DRAM Refresh}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "1--12", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337161", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Dynamic random-access memory (DRAM) is the building block of modern main memory systems. DRAM cells must be periodically refreshed to prevent loss of data. These refresh operations waste energy and degrade system performance by interfering with memory accesses. The negative effects of DRAM refresh increase as DRAM device capacity increases. Existing DRAM devices refresh all cells at a rate determined by the leakiest cell in the device. However, most DRAM cells can retain data for significantly longer. Therefore, many of these refreshes are unnecessary. 
In this paper, we propose RAIDR (Retention-Aware Intelligent DRAM Refresh), a low-cost mechanism that can identify and skip unnecessary refreshes using knowledge of cell retention times. Our key idea is to group DRAM rows into retention time bins and apply a different refresh rate to each bin. As a result, rows containing leaky cells are refreshed as frequently as normal, while most rows are refreshed less frequently. RAIDR uses Bloom filters to efficiently implement retention time bins. RAIDR requires no modification to DRAM and minimal modification to the memory controller. In an 8-core system with 32 GB DRAM, RAIDR achieves a 74.6\% refresh reduction, an average DRAM power reduction of 16.1\%, and an average system performance improvement of 8.6\% over existing systems, at a modest storage overhead of 1.25 KB in the memory controller. RAIDR's benefits are robust to variation in DRAM system configuration, and increase as memory capacity increases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bojnordi:2012:PPM, author = "Mahdi Nazm Bojnordi and Engin Ipek", title = "{PARDIS}: a programmable memory controller for the {DDRx} interfacing standards", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "13--24", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337162", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Modern memory controllers employ sophisticated address mapping, command scheduling, and power management optimizations to alleviate the adverse effects of DRAM timing and resource constraints on system performance. 
A promising way of improving the versatility and efficiency of these controllers is to make them programmable---a proven technique that has seen wide use in other control tasks ranging from DMA scheduling to NAND Flash and directory control. Unfortunately, the stringent latency and throughput requirements of modern DDRx devices have rendered such programmability largely impractical, confining DDRx controllers to fixed-function hardware. This paper presents the instruction set architecture (ISA) and hardware implementation of PARDIS, a programmable memory controller that can meet the performance requirements of a high-speed DDRx interface. The proposed controller is evaluated by mapping previously proposed DRAM scheduling, address mapping, refresh scheduling, and power management algorithms onto PARDIS. Simulation results show that the average performance of PARDIS comes within 8\% of fixed-function hardware for each of these techniques; moreover, by enabling application-specific optimizations, PARDIS improves system performance by 6--17\% and reduces DRAM energy by 9--22\% over four existing memory controllers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yoon:2012:BEM, author = "Doe Hyun Yoon and Jichuan Chang and Naveen Muralimanohar and Parthasarathy Ranganathan", title = "{BOOM}: enabling mobile memory based low-power server {DIMMs}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "25--36", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337163", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "To address the real-time processing needs of large and growing amounts of data, modern software increasingly uses main memory as the 
primary data store for critical information. This trend creates a new emphasis on high-capacity, high-bandwidth, and high-reliability main memory systems. Conventional and recently-proposed server memory techniques can satisfy these requirements, but at the cost of significantly increased memory power, a key constraint for future memory systems. In this paper, we exploit the low-power nature of another high volume memory component---mobile DRAM---while improving its bandwidth and reliability shortcomings with a new DIMM architecture. We propose Buffered Output On Module (BOOM) that buffers the data outputs from multiple ranks of low-frequency mobile DRAM devices, which in aggregation provide high bandwidth and achieve chipkill-correct or even stronger reliability. Our evaluation shows that BOOM can reduce main memory power by more than 73\% relative to the baseline chipkill system, while improving average performance by 5\% and providing strong reliability. For memory-intensive applications, BOOM can improve performance by 30--40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Malladi:2012:TEP, author = "Krishna T. Malladi and Benjamin C. Lee and Frank A. Nothaft and Christos Kozyrakis and Karthika Periyathambi and Mark Horowitz", title = "Towards energy-proportional datacenter memory with mobile {DRAM}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "37--48", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337164", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "To increase datacenter energy efficiency, we need memory systems that keep pace with processor efficiency gains. 
Currently, servers use DDR3 memory, which is designed for high bandwidth but not for energy proportionality. A system using 20\% of the peak DDR3 bandwidth consumes 2.3x the energy per bit compared to the energy consumed by a system with fully utilized memory bandwidth. Nevertheless, many datacenter applications stress memory capacity and latency but not memory bandwidth. In response, we architect server memory systems using mobile DRAM devices, trading peak bandwidth for lower energy consumption per bit and more efficient idle modes. We demonstrate 3--5x lower memory power, better proportionality, and negligible performance penalties for datacenter workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Brunie:2012:SBW, author = "Nicolas Brunie and Sylvain Collange and Gregory Diamos", title = "Simultaneous branch and warp interweaving for sustained {GPU} performance", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "49--60", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337166", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Single-Instruction Multiple-Thread (SIMT) micro-architectures implemented in Graphics Processing Units (GPUs) run fine-grained threads in lockstep by grouping them into units, referred to as warps, to amortize the cost of instruction fetch, decode and control logic over multiple execution units. As individual threads take divergent execution paths, their processing takes place sequentially, defeating part of the efficiency advantage of SIMD execution. We present two complementary techniques that mitigate the impact of thread divergence on SIMT micro-architectures. 
Both techniques relax the SIMD execution model by allowing two distinct instructions to be scheduled to disjoint subsets of the same row of execution units, instead of one single instruction. They increase flexibility by providing more thread grouping opportunities than SIMD, while preserving the affinity between threads to avoid introducing extra memory divergence. We consider (1) co-issuing instructions from different divergent paths of the same warp and (2) co-issuing instructions from different warps. To support (1), we introduce a novel thread reconvergence technique that ensures threads are run back in lockstep at control-flow reconvergence points without hindering their ability to run branches in parallel. We propose a lane shuffling technique to allow solution (2) to benefit from inter-warp correlations in divergence patterns. The combination of all these techniques improves performance by 23\% on a set of regular GPGPU applications and by 40\% on irregular applications, while maintaining the same instruction-fetch and processing-unit resource requirements as the contemporary Fermi GPU architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Rhu:2012:CPC, author = "Minsoo Rhu and Mattan Erez", title = "{CAPRI}: prediction of compaction-adequacy for handling control-divergence in {GPGPU} architectures", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "61--71", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337167", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Wide SIMD-based GPUs have evolved into a promising platform for running general purpose workloads. 
Current programmable GPUs allow even code with irregular control to execute well on their SIMD pipelines. To do this, each SIMD lane is considered to execute a logical thread where hardware ensures that control flow is accurate by automatically applying masked execution. The masked execution, however, often degrades performance because the issue slots of masked lanes are wasted. This degradation can be mitigated by dynamically compacting multiple unmasked threads into a single SIMD unit. This paper proposes a fundamentally new approach to branch compaction that avoids the unnecessary synchronization required by previous techniques and that only stalls threads that are likely to benefit from compaction. Our technique is based on the compaction-adequacy predictor (CAPRI). CAPRI dynamically identifies the compaction-effectiveness of a branch and only stalls threads that are predicted to benefit from compaction. We utilize a simple single-level branch-predictor inspired structure and show that this simple configuration attains a prediction accuracy of 99.8\% and 86.6\% for non-divergent and divergent workloads, respectively. 
Our performance evaluation demonstrates that CAPRI consistently outperforms both the baseline design that never attempts compaction and prior work that stalls upon all divergent branches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Menon:2012:IES, author = "Jaikrishnan Menon and Marc {De Kruijf} and Karthikeyan Sankaralingam", title = "{iGPU}: exception support and speculative execution on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "72--83", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337168", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Since the introduction of fully programmable vertex shader hardware, GPU computing has made tremendous advances. Exception support and speculative execution are the next steps to expand the scope and improve the usability of GPUs. However, traditional mechanisms to support exceptions and speculative execution are highly intrusive to GPU hardware design. This paper builds on two related insights to provide a unified lightweight mechanism for supporting exceptions and speculation on GPUs. First, we observe that GPU programs can be broken into code regions that contain little or no live register state at their entry point. We then also recognize that it is simple to generate these regions in such a way that they are idempotent, allowing their entry points to function as program recovery points and enabling support for exception handling, fast context switches, and speculation, all with very low overhead. We call the architecture of GPUs executing these idempotent regions the iGPU architecture. 
The hardware extensions required are minimal and the construction of idempotent code regions is fully transparent under the typical dynamic compilation framework of GPUs. We demonstrate how iGPU exception support enables virtual memory paging with very low overhead (1\% to 4\%), and how speculation support enables circuit-speculation techniques that can provide over 25\% reduction in energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Arnau:2012:BMG, author = "Jos{\'e}-Mar{\'\i}a Arnau and Joan-Manuel Parcerisa and Polychronis Xekalakis", title = "Boosting mobile {GPU} performance with a decoupled access\slash execute fragment processor", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "84--93", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337169", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Smartphones represent one of the fastest growing markets, providing significant hardware/software improvements every few months. However, supporting these capabilities reduces the operating time per battery charge. The CPU/GPU component is only left with a shrinking fraction of the power budget, since most of the energy is consumed by the screen and the antenna. In this paper, we focus on improving the energy efficiency of the GPU since graphical applications constitute an important part of the existing market. Moreover, the trend towards better screens will inevitably lead to a higher demand for improved graphics rendering. We show that the main bottleneck for these applications is the texture cache and that traditional techniques for hiding memory latency (prefetching, multithreading) do not work well or come at a high energy cost. 
We thus propose the migration of GPU designs towards the decoupled access-execute concept. Furthermore, we significantly reduce bandwidth usage in the decoupled architecture by exploiting inter-core data sharing. Using commercial Android applications, we show that the end design can achieve 93\% of the performance of a heavily multithreaded GPU while providing energy savings of 34\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kayaalp:2012:BRL, author = "Mehmet Kayaalp and Meltem Ozsoy and Nael Abu-Ghazaleh and Dmitry Ponomarev", title = "Branch regulation: low-overhead protection from code reuse attacks", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "94--105", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337171", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Code reuse attacks (CRAs) are recent security exploits that allow attackers to execute arbitrary code on a compromised machine. CRAs, exemplified by return-oriented and jump-oriented programming approaches, reuse fragments of the library code, thus avoiding the need for explicit injection of attack code on the stack. Since the executed code is reused existing code, CRAs bypass current hardware and software security measures that prevent execution from data or stack regions of memory. While software-based full control flow integrity (CFI) checking can protect against CRAs, it includes significant overhead, involves non-trivial effort of constructing a control flow graph, relies on proprietary tools and has potential vulnerabilities due to the presence of unintended branch instructions in architectures such as x86---those branches are not checked by the software CFI. 
We propose branch regulation (BR), a lightweight hardware-supported protection mechanism against the CRAs that addresses all limitations of software CFI. BR enforces simple control flow rules in hardware at the function granularity to disallow arbitrary control flow transfers from one function into the middle of another function. This prevents common classes of CRAs without the complexity and run-time overhead of full CFI enforcement. BR incurs a slowdown of about 2\% and increases the code footprint by less than 1\% on the average for the SPEC 2006 benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Demme:2012:SCV, author = "John Demme and Robert Martin and Adam Waksman and Simha Sethumadhavan", title = "Side-channel vulnerability factor: a metric for measuring information leakage", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "106--117", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337172", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "There have been many attacks that exploit side-effects of program execution to expose secret information and many proposed countermeasures to protect against these attacks. However there is currently no systematic, holistic methodology for understanding information leakage. As a result, it is not well known how design decisions affect information leakage or the vulnerability of systems to side-channel attacks. In this paper, we propose a metric for measuring information leakage called the Side-channel Vulnerability Factor (SVF). 
SVF is based on our observation that all side-channel attacks ranging from physical to microarchitectural to software rely on recognizing leaked execution patterns. SVF quantifies patterns in attackers' observations and measures their correlation to the victim's actual execution patterns and in doing so captures systems' vulnerability to side-channel attacks. In a detailed case study of on-chip memory systems, SVF measurements help expose unexpected vulnerabilities in whole-system designs and show how designers can make performance-security trade-offs. Thus, SVF provides a quantitative approach to secure computer architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Martin:2012:TRT, author = "Robert Martin and John Demme and Simha Sethumadhavan", title = "{TimeWarp}: rethinking timekeeping and performance monitoring mechanisms to mitigate side-channel attacks", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "118--129", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337173", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Over the past two decades, several microarchitectural side channels have been exploited to create sophisticated security attacks. Solutions to this problem have mainly focused on fixing the source of leaks either by limiting the flow of information through the side channel by modifying hardware, or by refactoring vulnerable software to protect sensitive data from leaking. 
These solutions are reactive and not preventative: while the modifications may protect against a single attack, they do nothing to prevent future side channel attacks that exploit other microarchitectural side channels or exploit the same side channel in a novel way. In this paper we present a general mitigation strategy that focuses on the infrastructure used to measure side channel leaks rather than the source of leaks, and thus applies to all known and unknown microarchitectural side channel leaks. Our approach is to limit the fidelity of fine grain timekeeping and performance counters, making it difficult for an attacker to distinguish between different microarchitectural events, thus thwarting attacks. We demonstrate the strength of our proposed security modifications, and validate that our changes do not break existing software. Our proposed changes require minor --- or in some cases, no --- hardware modifications and do not result in any substantial performance degradation, yet offer the most comprehensive protection against microarchitectural side channels to date.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Valamehr:2012:IRM, author = "Jonathan Valamehr and Melissa Chase and Seny Kamara and Andrew Putnam and Dan Shumow and Vinod Vaikuntanathan and Timothy Sherwood", title = "Inspection resistant memory: architectural support for security from physical examination", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "130--141", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337174", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "The ability to safely keep a secret in memory is central to the vast majority of security 
schemes, but storing and erasing these secrets is a difficult problem in the face of an attacker who can obtain unrestricted physical access to the underlying hardware. Depending on the memory technology, the very act of storing a 1 instead of a 0 can have physical side effects measurable even after the power has been cut. These effects cannot be hidden easily, and if the secret stored on chip is of sufficient value, an attacker may go to extraordinary means to learn even a few bits of that information. Solving this problem requires a new class of architectures that measurably increase the difficulty of physical analysis. In this paper we take a first step towards this goal by focusing on one of the backbones of any hardware system: on-chip memory. We examine the relationship between security, area, and efficiency in these architectures, and quantitatively examine the resulting systems through cryptographic analysis and microarchitectural impact. In the end, we are able to find an efficient scheme in which, even if an adversary is able to inspect the value of a stored bit with a probabilistic error of only 5\%, our system will be able to prevent that adversary from learning any information about the original un-coded bits with 99.9999999999\% probability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Xu:2012:TPV, author = "Yi Xu and Jun Yang and Rami Melhem", title = "Tolerating process variations in nanophotonic on-chip networks", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "142--152", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337176", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Nanophotonic networks, a potential 
candidate for future networks on-chip, have been challenged for their reliability due to several device-level limitations. One of the main issues is that fabrication errors (a.k.a. process variations) can cause devices to malfunction, rendering communication unreliable. For example, microring resonator, a preferred optical modulator device, may not resonate at the designated wavelength under process variations (PV), leading to communication errors and bandwidth loss. This paper proposes a series of solutions to the wavelength drifting problem of microrings and subsequent bandwidth loss problem of an optical network, due to PV. The objective is to maximize network bandwidth through proper arrangement among microrings and wavelengths with minimum power requirement. Our arrangement, called ``MinTrim'', solves this problem using simple integer linear programming, adding supplementary microrings and allowing flexible assignment of wavelengths to network nodes as long as the resulting network presents maximal bandwidth. Each step is shown to improve bandwidth provisioning with lower power requirement. Evaluations on a sample network show that a baseline network could lose more than 40\% bandwidth due to PV. Such loss can be recovered by MinTrim to produce a network with 98.4\% working bandwidth. In addition, the power required in arranging microrings is 39\% lower than the baseline. Therefore, MinTrim provides an efficient PV-tolerant solution to improving the reliability of on-chip photonics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Koka:2012:MAA, author = "Pranay Koka and Michael O. McCracken and Herb Schwetman and Chia-Hsin Owen Chen and Xuezhe Zheng and Ron Ho and Kannan Raj and Ashok V. 
Krishnamoorthy", title = "A micro-architectural analysis of switched photonic multi-chip interconnects", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "153--164", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337177", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Silicon photonics is a promising technology to scale offchip bandwidth in a power-efficient manner. Given equivalent bandwidth, the flexibility of switched networks often leads to the assumption that they deliver greater performance than point-to-point networks on message passing applications with low-radix traffic patterns. However, when optical losses are considered and total optical power is constrained, this assumption no longer holds. In this paper we present a power constrained method for designing photonic interconnects that uses the power characteristics and limits of optical switches, waveguide crossings, inter-layer couplers and waveguides. We apply this method to design three switched network topologies for a multi-chip system. Using synthetic and HPC benchmark-derived message patterns, we simulated the three switched networks and a WDM point-to-point network. We show that switched networks outperform point-to-point networks only when the optical losses of switches and inter-layer couplers losses are each 0.75 dB or lower; achieving this would require a major breakthrough in device development. We then show that this result extends to any switched network with similarly complex topology, through simulations of an idealized ``perfect'' network that supports 90\% of the peak bandwidth under all traffic patterns. 
We conclude that given a fixed amount of input optical power, under realistic device assumptions, a point-to-point network has the best performance and energy characteristics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Carpenter:2012:EET, author = "Aaron Carpenter and Jianyun Hu and Ovunc Kocabas and Michael Huang and Hui Wu", title = "Enhancing effective throughput for transmission line-based bus", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "165--176", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337178", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Main-stream general-purpose microprocessors require a collection of high-performance interconnects to supply the necessary data movement. The trend of continued increase in core count has prompted designs of packet-switched network as a scalable solution for future-generation chips. However, the cost of scalability can be significant and especially hard to justify for smaller-scale chips. In contrast, a circuit-switched bus using transmission lines and corresponding circuits offers lower latencies and much lower energy costs for smaller-scale chips, making it a better choice than a full-blown network-on-chip (NoC) architecture. However, shared-medium designs are perceived as only a niche solution for small- to medium-scale chips. In this paper, we show that there are many low-cost mechanisms to enhance the effective throughput of a bus architecture. When a handful of highly cost-effective techniques are applied, the performance advantage of even the most idealistically configured NoCs becomes vanishingly small. 
We find transmission line-based buses to be a more compelling interconnect even for large-scale chip-multiprocessors, and thus bring into doubt the centrality of packet switching in future on-chip interconnect.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Koibuchi:2012:CRS, author = "Michihiro Koibuchi and Hiroki Matsutani and Hideharu Amano and D. Frank Hsu and Henri Casanova", title = "A case for random shortcut topologies for {HPC} interconnects", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "177--188", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337179", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "As the scales of parallel applications and platforms increase, the negative impact of communication latencies on performance becomes large. Fortunately, modern High Performance Computing (HPC) systems can exploit low-latency topologies of high-radix switches. In this context, we propose the use of random shortcut topologies, which are generated by augmenting classical topologies with random links. Using graph analysis we find that these topologies, when compared to non-random topologies of the same degree, lead to drastically reduced diameter and average shortest path length. The best results are obtained when adding random links to a ring topology, meaning that good random shortcut topologies can easily be generated for arbitrary numbers of switches. Using flit-level discrete event simulation we find that random shortcut topologies achieve throughput comparable to and latency lower than that of existing non-random topologies such as hypercubes and tori. 
Finally, we discuss and quantify practical challenges for random shortcut topologies, including routing scalability and larger physical cable lengths.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nagarakatte:2012:WHS, author = "Santosh Nagarakatte and Milo M. K. Martin and Steve Zdancewic", title = "{Watchdog}: hardware for safe and secure manual memory management and full memory safety", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "189--200", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337181", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Languages such as C and C++ use unsafe manual memory management, allowing simple bugs (i.e., accesses to an object after deallocation) to become the root cause of exploitable security vulnerabilities. This paper proposes Watchdog, a hardware-based approach for ensuring safe and secure manual memory management. Inspired by prior software-only proposals, Watchdog generates a unique identifier for each memory allocation, associates these identifiers with pointers, and checks to ensure that the identifier is still valid on every memory access. This use of identifiers and checks enables Watchdog to detect errors even in the presence of reallocations. Watchdog stores these pointer identifiers in a disjoint shadow space to provide comprehensive protection and ensure compatibility with existing code. To streamline the implementation and reduce runtime overhead: Watchdog (1) uses micro-ops to access metadata and perform checks, (2) eliminates metadata copies among registers via modified register renaming, and (3) uses a dedicated metadata cache to reduce checking overhead. 
Furthermore, this paper extends Watchdog's mechanisms to detect bounds errors, thereby providing full hardware-enforced memory safety at low overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Devietti:2012:RAS, author = "Joseph Devietti and Benjamin P. Wood and Karin Strauss and Luis Ceze and Dan Grossman and Shaz Qadeer", title = "{RADISH}: always-on sound and complete {{\underline{Ra}ce \underline{D}etection \underline{i}n \underline{S}oftware and \underline{H}ardware}}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "201--212", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337182", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Data-race freedom is a valuable safety property for multithreaded programs that helps with catching bugs, simplifying memory consistency model semantics, and verifying and enforcing both atomicity and determinism. Unfortunately, existing software-only dynamic race detectors are precise but slow; proposals with hardware support offer higher performance but are imprecise. Both precision and performance are necessary to achieve the many advantages always-on dynamic race detection could provide. To resolve this trade-off, we propose Radish, a hybrid hardware-software dynamic race detector that is always-on and fully precise. In Radish, hardware caches a principled subset of the metadata necessary for race detection; this subset allows the vast majority of race checks to occur completely in hardware. A flexible software layer handles persistence of race detection metadata on cache evictions and occasional queries to this expanded set of metadata. 
We show that Radish is correct by proving equivalence to a conventional happens-before race detector. Our design has modest hardware complexity: caches are completely unmodified and we piggy-back on existing coherence messages but do not otherwise modify the protocol. Furthermore, Radish can leverage type-safe languages to reduce overheads substantially. Our evaluation of a simulated 8-core Radish processor using PARSEC benchmarks shows runtime overheads from negligible to 2x, outperforming the leading software-only race detector by 2x-37x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{VanCraeynest:2012:SHM, author = "Kenzo {Van Craeynest} and Aamer Jaleel and Lieven Eeckhout and Paolo Narvaez and Joel Emer", title = "Scheduling heterogeneous multi-cores through {Performance Impact Estimation (PIE)}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "213--224", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337184", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Single-ISA heterogeneous multi-core processors are typically composed of small (e.g., in-order) power-efficient cores and big (e.g., out-of-order) high-performance cores. The effectiveness of heterogeneous multi-cores depends on how well a scheduler can map workloads onto the most appropriate core type. In general, small cores can achieve good performance if the workload inherently has high levels of ILP. On the other hand, big cores provide good performance if the workload exhibits high levels of MLP or requires the ILP to be extracted dynamically. 
This paper proposes Performance Impact Estimation (PIE) as a mechanism to predict which workload-to-core mapping is likely to provide the best performance. PIE collects CPI stack, MLP and ILP profile information, and estimates performance if the workload were to run on a different core type. Dynamic PIE adjusts the scheduling at runtime and thereby exploits fine-grained time-varying execution behavior. We show that PIE requires limited hardware support and can improve system performance by an average of 5.5\% over recent state-of-the-art scheduling proposals and by 8.7\% over a sampling-based scheduling policy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cao:2012:YYP, author = "Ting Cao and Stephen M. Blackburn and Tiejun Gao and Kathryn S. McKinley", title = "The yin and yang of power and performance for asymmetric hardware and managed software", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "225--236", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337185", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "On the hardware side, asymmetric multicore processors present software with the challenge and opportunity of optimizing in two dimensions: performance and power. Asymmetric multicore processors (AMP) combine general-purpose big (fast, high power) cores and small (slow, low power) cores to meet power constraints. Realizing their energy efficiency opportunity requires workloads with differentiated performance and power characteristics. On the software side, managed workloads written in languages such as C\#, Java, JavaScript, and PHP are ubiquitous. 
Managed languages abstract over hardware using Virtual Machine (VM) services (garbage collection, interpretation, and/or just-in-time compilation) that together impose substantial energy and performance costs, ranging from 10\% to over 80\%. We show that these services manifest a differentiated performance and power workload. To differing degrees, they are parallel, asynchronous, communicate infrequently, and are not on the application's critical path. We identify a synergy between AMP and VM services that we exploit to attack the 40\% average energy overhead due to VM services. Using measurements and very conservative models, we show that adding small cores tailored for VM services should deliver, at least, improvements in performance of 13\%, energy of 7\%, and performance per energy of 22\%. The yin of VM services is overhead, but it meets the yang of small cores on an AMP. The yin of AMP is exposed hardware complexity, but it meets the yang of abstraction in managed languages. VM services fulfill the AMP requirement for an asynchronous, non-critical, differentiated, parallel, and ubiquitous workload to deliver energy efficiency. 
Generalizing this approach beyond system software to applications will require substantially more software and hardware investment, but these results show the potential energy efficiency gains are significant.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Krimer:2012:LDI, author = "Evgeni Krimer and Patrick Chiang and Mattan Erez", title = "Lane decoupling for improving the timing-error resiliency of wide-{SIMD} architectures", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "237--248", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337187", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "A significant portion of the energy dissipated in modern integrated circuits is consumed by the overhead associated with timing guardbands that ensure reliable execution. Timing speculation, where the pipeline operates at an unsafe voltage with any rare errors detected and resolved by the architecture, has been demonstrated to significantly improve the energy-efficiency of scalar processor designs. Unfortunately, applying the same timing-speculative approach to wide-SIMD architectures, such as those used in highly-efficient GPUs, may not provide similar gains. In this work, we make two important contributions. The first is a set of models describing a parametrized general error probability function that is based on measurements of a fabricated chip and the expected efficiency benefits of timing speculation in a SIMD context. The second contribution is a decoupled SIMD pipeline that more effectively utilizes timing speculation and recovery, when compared with a standard SIMD design that uses only conventional timing speculation. 
The proposed lane decoupling enables each SIMD lane to tolerate timing errors independent of other adjacent lanes, resulting in higher throughput and improved scalability. We validate our models and evaluate our design using a cycle-based GPU simulator, describe the conditions where efficiency improvements can be obtained, and explore the benefits of decoupling across a wide range of parameters. Our results show that timing speculation can achieve up to 10.3\% improvement in efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Miller:2012:VCE, author = "Timothy N. Miller and Renji Thomas and Xiang Pan and Radu Teodorescu", title = "{VRSync}: characterizing and eliminating synchronization-induced voltage emergencies in many-core processors", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "249--260", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337188", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Power consumption is a primary concern for microprocessor designers. Lowering the supply voltage of processors is one of the most effective techniques for improving their energy efficiency. Unfortunately, low-voltage operation faces multiple challenges going forward. One such challenge is increased sensitivity to voltage fluctuations, which can trigger so-called ``voltage emergencies'' that can lead to errors. These fluctuations are caused by abrupt changes in power demand, triggered by processor activity variation as a function of workload. This paper examines the effects of voltage fluctuations on future many-core processors. 
With the increase in the number of cores in a chip, the effects of chip-wide activity fluctuation --- such as that caused by global synchronization in multithreaded applications --- overshadow the effects of core-level workload variability. Starting from this observation, we developed VRSync, a novel synchronization methodology that uses emergency-aware scheduling policies that reduce the slope of load fluctuations, eliminating emergencies. We show that VRSync is very effective at eliminating emergencies, allowing voltage guardbands to be significantly lowered, which reduces energy consumption by an average of 33\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Doudalis:2012:EFU, author = "Ioannis Doudalis and Milos Prvulovic", title = "{Euripus}: a flexible unified hardware memory checkpointing accelerator for bidirectional-debugging and reliability", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "261--272", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337190", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Bidirectional debugging and error recovery have different goals (programmer productivity and system reliability, respectively), yet they both require the ability to roll-back the program or the system to a past state. This rollback functionality is typically implemented using checkpoints that can restore the system/application to a specific point in time. There are several types of checkpoints, and bidirectional debugging and error-recovery use them in different ways. 
This paper presents Euripus, a flexible hardware accelerator for memory checkpointing which can create different combinations of checkpoints needed for bidirectional debugging, error recovery, or both. In particular, Euripus is the first hardware technique to provide consolidation-friendly undo-logs (for bidirectional debugging), to allow simultaneous construction of both undo and redo logs, and to support multi-level checkpointing for the needs of error-recovery. Euripus incurs low performance overheads ({$<$5}\% on average), improves roll-back latency for bidirectional debugging by {$>$30}\%, and supports rapid multi-level error recovery that allows {$>$95}\% system efficiency even with very high error rates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nair:2012:FOM, author = "Arun Arvind Nair and Stijn Eyerman and Lieven Eeckhout and Lizy Kurian John", title = "A first-order mechanistic model for architectural vulnerability factor", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "273--284", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337191", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Soft error reliability has become a first-order design criterion for modern microprocessors. Architectural Vulnerability Factor (AVF) modeling is often used to capture the probability that a radiation-induced fault in a hardware structure will manifest as an error at the program output. AVF estimation requires detailed microarchitectural simulations which are time-consuming and typically present aggregate metrics. Moreover, it requires a large number of simulations to derive insight into the impact of microarchitectural events on AVF. 
In this work we present a first-order mechanistic analytical model for computing AVF by estimating the occupancy of correct-path state in important microarchitecture structures through inexpensive profiling. We show that the model estimates the AVF for the reorder buffer, issue queue, load and store queue, and functional units in a 4-wide issue machine with a mean absolute error of less than 0.07. The model is constructed from the first principles of out-of-order processor execution in order to provide novel insight into the interaction of the workload with the microarchitecture to determine AVF. We demonstrate that the model can be used to perform design space explorations to understand trade-offs between soft error rate and performance, to study the impact of scaling of microarchitectural structures on AVF and performance, and to characterize workloads for AVF.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Udipi:2012:LEL, author = "Aniruddha N. Udipi and Naveen Muralimanohar and Rajeev Balasubramonian and Al Davis and Norman P. Jouppi", title = "{LOT-ECC}: localized and tiered reliability mechanisms for commodity memory systems", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "285--296", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337192", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Memory system reliability is a serious and growing concern in modern servers. Existing chipkill-level memory protection mechanisms suffer from several drawbacks. They activate a large number of chips on every memory access --- this increases energy consumption, and reduces performance due to the reduction in rank-level parallelism. 
Additionally, they increase access granularity, resulting in wasted bandwidth in the absence of sufficient access locality. They also restrict systems to use narrow-I/O x4 devices, which are known to be less energy-efficient than the wider x8 DRAM devices. In this paper, we present LOT-ECC, a localized and multi-tiered protection scheme that attempts to solve these problems. We separate error detection and error correction functionality, and employ simple checksum and parity codes effectively to provide strong fault-tolerance, while simultaneously simplifying implementation. Data and codes are localized to the same DRAM row to improve access efficiency. We use system firmware to store correction codes in DRAM data memory and modify the memory controller to handle data mapping. We thus build an effective fault-tolerance mechanism that provides strong reliability guarantees, activates as few chips as possible (reducing power consumption by up to 44.8\% and reducing latency by up to 46.9\%), and reduces circuit complexity, all while working with commodity DRAMs and operating systems. Finally, we propose the novel concept of a heterogeneous DIMM that enables the extension of LOT-ECC to x16 and wider DRAM parts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Basu:2012:RMR, author = "Arkaprava Basu and Mark D. Hill and Michael M. 
Swift", title = "Reducing memory reference energy with opportunistic virtual caching", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "297--308", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337194", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Most modern cores perform a highly-associative translation lookaside buffer (TLB) lookup on every memory access. These designs often hide the TLB lookup latency by overlapping it with L1 cache access, but this overlap does not hide the power dissipated by TLB lookups. It can even exacerbate the power dissipation by requiring higher associativity L1 cache. With today's concern for power dissipation, designs could instead adopt a virtual L1 cache, wherein TLB access power is dissipated only after L1 cache misses. Unfortunately, virtual caches have compatibility issues, such as supporting writeable synonyms and x86's physical page table walker. This work proposes an Opportunistic Virtual Cache (OVC) that exposes virtual caching as a dynamic optimization by allowing some memory blocks to be cached with virtual addresses and others with physical addresses. OVC relies on small OS changes to signal which pages can use virtual caching (e.g., no writeable synonyms), but defaults to physical caching for compatibility. We show OVC's promise with analysis that finds virtual cache problems exist, but are dynamically rare. We change 240 lines in Linux 2.6.28 to enable OVC. 
On experiments with Parsec and commercial workloads, the resulting system saves 94-99\% of TLB lookup energy and nearly 23\% of L1 cache dynamic lookup energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2012:IWE, author = "Zhe Wang and Samira M. Khan and Daniel A. Jim{\'e}nez", title = "Improving writeback efficiency with decoupled last-write prediction", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "309--320", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337195", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "In modern DDRx memory systems, memory write requests compete with read requests for available memory resources, significantly increasing the average read request service time. Caches are used to mitigate long memory read latency that limits system performance. Dirty blocks in the last-level cache (LLC) that will not be written again before they are evicted will eventually be written back to memory. We refer to these blocks as last-write blocks. In this paper, we propose an LLC writeback technique that improves DRAM efficiency by scheduling predicted last-write blocks early. We propose a low overhead last-write predictor for the LLC. The predicted last-write blocks are made available to the memory controller for scheduling. This technique effectively re-distributes the memory requests and expands writes scheduling opportunities, allowing writes to be serviced efficiently by DRAM. The technique is flexible enough to be applied to any LLC replacement policy. 
Our evaluation with multi-programmed workloads shows that the technique significantly improves performance by 6.5\%-11.4\% on average over the traditional writeback technique in an eight-core processor with various DRAM configurations running memory intensive benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sim:2012:FBC, author = "Jaewoong Sim and Jaekyu Lee and Moinuddin K. Qureshi and Hyesoon Kim", title = "{FLEXclusion}: balancing cache capacity and on-chip bandwidth via flexible exclusion", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "321--332", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337196", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Exclusive last-level caches (LLCs) reduce memory accesses by effectively utilizing cache capacity. However, they require excessive on-chip bandwidth to support frequent insertions of cache lines on eviction from upper-level caches. Non-inclusive caches, on the other hand, have the advantage of using the on-chip bandwidth more effectively but suffer from a higher miss rate. Traditionally, the decision to use the cache as exclusive or non-inclusive is made at design time. However, the best option for a cache organization depends on application characteristics, such as working set size and the amount of traffic consumed by LLC insertions. This paper proposes FLEXclusion, a design that dynamically selects between exclusion and non-inclusion depending on workload behavior. 
With FLEXclusion, the cache behaves like an exclusive cache when the application benefits from extra cache capacity, and it acts as a non-inclusive cache when additional cache capacity is not useful, so that it can reduce on-chip bandwidth. FLEXclusion leverages the observation that both non-inclusion and exclusion rely on similar hardware support, so our proposal can be implemented with negligible hardware changes. Our evaluations show that a FLEXclusive cache reduces the on-chip LLC insertion traffic by 72.6\% compared to an exclusive design and improves performance by 5.9\% compared to a non-inclusive design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Upasani:2012:SED, author = "Gaurang Upasani and Xavier Vera and Antonio Gonz{\'a}lez", title = "Setting an error detection infrastructure with low cost acoustic wave detectors", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "333--343", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337198", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "The continuing decrease in dimensions and operating voltage of transistors has increased their sensitivity against radiation phenomena making soft errors an important challenge in future chip multiprocessors (CMPs). Hence, new techniques for detecting errors in the logic and memories that allow meeting the desired failures-in-time (FIT) budget in CMPs are required. This paper proposes a low-cost dynamic particle strike detection mechanism through acoustic wave detectors. Our results show that our mechanism can protect both the logic and the memory arrays. 
As a case study, we also show how this technique can be combined with error codes to protect the last-level cache at low cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pellegrini:2012:VVP, author = "Andrea Pellegrini and Joseph L. Greathouse and Valeria Bertacco", title = "{Viper}: virtual pipelines for enhanced reliability", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "344--355", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337199", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "The reliability of future processors is threatened by decreasing transistor robustness. Current architectures focus on delivering high performance at low cost; lifetime device reliability is a secondary concern. As the rate of permanent hardware faults increases, robustness will become a first class constraint for even low-cost systems. Current research into reliable architectures has focused on ad-hoc solutions to improve designs without altering their centralized control logic. Unfortunately, this centralized control presents a single point of failure, which limits long-term robustness. To address this issue, we introduce Viper, an architecture built from a redundant collection of fine-grained hardware components. Instructions are perceived as customers that require a sequence of services in order to properly execute. The hardware components vie to perform what services they can, dynamically forming virtual pipelines that avoid defective hardware. This is done using distributed control logic, which avoids a single point of failure by construction. Viper can tolerate a high number of permanent faults due to its inherent redundancy. 
As fault counts increase, its performance degrades more gracefully than traditional centralized-logic architectures. We estimate that fault rates higher than one permanent fault per 12 million transistors, on average, cause the throughput of a classic CMP design to fall below that of a Viper design of similar size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Temam:2012:DTA, author = "Olivier Temam", title = "A defect-tolerant accelerator for emerging high-performance applications", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "356--367", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337200", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Due to the evolution of technology constraints, especially energy constraints which may lead to heterogeneous multi-cores, and the increasing number of defects, the design of defect-tolerant accelerators for heterogeneous multi-cores may become a major micro-architecture research issue. Most custom circuits are highly defect sensitive, a single transistor can wreck such circuits. On the contrary, artificial neural networks (ANNs) are inherently error tolerant algorithms. And the emergence of high-performance applications implementing recognition and mining tasks, for which competitive ANN-based algorithms exist, drastically expands the potential application scope of a hardware ANN accelerator. However, while the error tolerance of ANN algorithms is well documented, there are few in-depth attempts at demonstrating that an actual hardware ANN would be tolerant to faulty transistors. 
Most fault models are abstract and cannot demonstrate that the error tolerance of ANN algorithms can be translated into the defect tolerance of hardware ANN accelerators. In this article, we introduce a hardware ANN geared towards defect tolerance and energy efficiency, by spatially expanding the ANN. In order to precisely assess the defect tolerance capability of this hardware ANN, we introduce defects at the level of transistors, and then assess the impact of such defects on the hardware ANN functional behavior. We empirically show that the conceptual error tolerance of neural networks does translate into the defect tolerance of hardware neural networks, paving the way for their introduction in heterogeneous multi-cores as intrinsically defect-tolerant and energy-efficient accelerators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kim:2012:CES, author = "Yoongu Kim and Vivek Seshadri and Donghyuk Lee and Jamie Liu and Onur Mutlu", title = "A case for exploiting subarray-level parallelism {(SALP)} in {DRAM}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "368--379", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337202", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Modern DRAMs have multiple banks to serve multiple memory requests in parallel. However, when two requests go to the same bank, they have to be served serially, exacerbating the high latency of off-chip memory. Adding more banks to the system to mitigate this problem incurs high system cost. Our goal in this work is to achieve the benefits of increasing the number of banks with a low cost approach. 
To this end, we propose three new mechanisms that overlap the latencies of different requests that go to the same bank. The key observation exploited by our mechanisms is that a modern DRAM bank is implemented as a collection of subarrays that operate largely independently while sharing few global peripheral structures. Our proposed mechanisms (SALP-1, SALP-2, and MASA) mitigate the negative impact of bank serialization by overlapping different components of the bank access latencies of multiple requests that go to different subarrays within the same bank. SALP-1 requires no changes to the existing DRAM structure and only needs reinterpretation of some DRAM timing parameters. SALP-2 and MASA require only modest changes ({$<$} 0.15\% area overhead) to the DRAM peripheral structures, which are much less design constrained than the DRAM core. Evaluations show that all our schemes significantly improve performance for both single-core systems and multi-core systems. Our schemes also interact positively with application-aware memory request scheduling in multi-core systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Qureshi:2012:PIP, author = "Moinuddin K. Qureshi and Michele M. Franceschini and Ashish Jagmohan and Luis A. Lastras", title = "{PreSET}: improving performance of phase change memories by exploiting asymmetry in write times", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "380--391", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337203", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Phase Change Memory (PCM) is a promising technology for building future main memory systems. 
A prominent characteristic of PCM is that it has write latency much higher than read latency. Servicing such slow writes causes significant contention for read requests. For our baseline PCM system, the slow writes increase the effective read latency by almost 2X, causing significant performance degradation. This paper alleviates the problem of slow writes by exploiting the fundamental property of PCM devices that writes are slow only in one direction (SET operation) and are almost as fast as reads in the other direction (RESET operation). Therefore, a write operation to a line in which all memory cells have been SET prior to the write, will incur much lower latency. We propose PreSET, an architectural technique that leverages this property to pro-actively SET all the bits in a given memory line well in advance of the anticipated write to that memory line. Our proposed design initiates a PreSET request for a memory line as soon as that line becomes dirty in the cache, thereby allowing a large window of time for the PreSET operation to complete. Our evaluations show that PreSET is more effective and incurs lower storage overhead than previously proposed write cancellation techniques. We also describe static and dynamic throttling schemes to limit the rate of PreSET operations. 
Our proposal reduces effective read latency from 982 cycles to 594 cycles and increases system performance by 34\%, while improving the energy-delay-product by 25\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cooper-Balis:2012:BBM, author = "Elliott Cooper-Balis and Paul Rosenfeld and Bruce Jacob", title = "Buffer-on-board memory systems", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "392--403", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337204", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "The design and implementation of the commodity memory architecture has resulted in significant performance and capacity limitations. To circumvent these limitations, designers and vendors have begun to place intermediate logic between the CPU and DRAM. This additional logic has two functions: to control the DRAM and to communicate with the CPU over a fast and narrow bus. The benefit provided by this logic is a reduction in pin-out to the memory system and increased signal integrity to the DRAM, allowing faster clock rates while maintaining capacity. While the few vendors utilizing this design have used the same general approach, their implementations vary greatly in their nontrivial details. A hardware-verified simulation suite is developed to accurately model and evaluate the behavior of this buffer-on-board memory system. A study of this design space is used to determine optimal use of the resources involved. This includes DRAM and bus organization, queue storage, and mapping schemes. Various constraints based on implementation costs are placed on simulated configurations to confirm that these optimizations apply to viable systems.
Finally, full system simulations are performed to better understand how this memory system interacts with an operating system executing an application with the goal of uncovering behaviors not present in simple limit case simulations. When applying insights gleaned from these simulations, optimal performance can be achieved while still considering outside constraints (i.e., pin-out, power, and fabrication costs).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Jung:2012:PAQ, author = "Myoungsoo Jung and Ellis H. {Wilson III} and Mahmut Kandemir", title = "{Physically Addressed Queueing (PAQ)}: improving parallelism in solid state disks", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "404--415", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337206", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "NAND flash storage has proven to be a competitive alternative to traditional disk for its properties of high random-access speeds, low-power and its presumed efficacy for random-reads. Ironically, we demonstrate that when packaged in SSD format, there arise many barriers to reaching full parallelism in reads, resulting in random writes out-performing them. Motivated by this, we propose Physically Addressed Queuing (PAQ), a request scheduler that avoids resource contention resultant from shared SSD resources. PAQ makes the following major contributions: First, it exposes the physical addresses of requests to the scheduler. Second, I/O clumping is utilized to select groups of operations that can be simultaneously executed without major resource conflict. Third, inter-request NAND transaction packing empowers multi-plane-mode operations. 
We implement PAQ in a cycle-accurate simulator and demonstrate bandwidth and IOPS improvements greater than 62\% and latency decreases as much as 41.6\% for random reads, without degrading performance of other access types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ausavarungnirun:2012:SMS, author = "Rachata Ausavarungnirun and Kevin Kai-Wei Chang and Lavanya Subramanian and Gabriel H. Loh and Onur Mutlu", title = "Staged memory scheduling: achieving high performance and scalability in heterogeneous systems", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "416--427", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337207", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "When multiple processor (CPU) cores and a GPU integrated together on the same chip share the off-chip main memory, requests from the GPU can heavily interfere with requests from the CPU cores, leading to low system performance and starvation of CPU cores. Unfortunately, state-of-the-art application-aware memory scheduling algorithms are ineffective at solving this problem at low complexity due to the large amount of GPU traffic. A large and costly request buffer is needed to provide these algorithms with enough visibility across the global request stream, requiring relatively complex hardware implementations. This paper proposes a fundamentally new approach that decouples the memory controller's three primary tasks into three significantly simpler structures that together improve system performance and fairness, especially in integrated CPU-GPU systems. Our three-stage memory controller first groups requests based on row-buffer locality. 
This grouping allows the second stage to focus only on inter-application request scheduling. These two stages enforce high-level policies regarding performance and fairness, and therefore the last stage consists of simple per-bank FIFO queues (no further command reordering within each bank) and straightforward logic that deals only with low-level DRAM commands and timing. We evaluate the design trade-offs involved in our Staged Memory Scheduler (SMS) and compare it against three state-of-the-art memory controller designs. Our evaluations show that SMS improves CPU performance without degrading GPU frame rate beyond a generally acceptable level, while being significantly less complex to implement than previous application-aware schedulers. Furthermore, SMS can be configured by the system software to prioritize the CPU or the GPU at varying levels to address different performance needs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Manikantan:2012:PSC, author = "R. Manikantan and Kaushik Rajan and R. Govindarajan", title = "{Probabilistic Shared Cache Management (PriSM)}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "428--439", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337208", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Effective sharing of the last level cache has a significant influence on the overall performance of a multicore system. We observe that existing solutions control cache occupancy at a coarser granularity, do not scale well to large core counts and in some cases lack the flexibility to support a variety of performance goals. 
In this paper, we propose Probabilistic Shared Cache Management (PriSM), a framework to manage the cache occupancy of different cores at cache block granularity by controlling their eviction probabilities. The proposed framework requires only simple hardware changes to implement, can scale to larger core count and is flexible enough to support a variety of performance goals. We demonstrate the flexibility of PriSM, by computing the eviction probabilities needed to achieve goals like hit-maximization, fairness and QOS. PriSM-HitMax improves performance by 18.7\% over LRU and 11.8\% over previously proposed schemes in a sixteen core machine. PriSM-Fairness improves fairness over existing solutions by 23.3\% along with a performance improvement of 19.0\%. PriSM-QOS successfully achieves the desired QOS targets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Satish:2012:CTP, author = "Nadathur Satish and Changkyu Kim and Jatin Chhugani and Hideki Saito and Rakesh Krishnaiyer and Mikhail Smelyanskiy and Milind Girkar and Pradeep Dubey", title = "Can traditional programming bridge the {Ninja} performance gap for parallel computing applications?", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "440--451", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337210", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Current processor trends of integrating more cores with wider SIMD units, along with a deeper and complex memory hierarchy, have made it increasingly more challenging to extract performance from applications. 
It is believed by some that traditional approaches to programming do not apply to these modern processors and hence radical new languages must be discovered. In this paper, we question this thinking and offer evidence in support of traditional programming methods and the performance-vs-programming effort effectiveness of common multi-core processors and upcoming many-core architectures in delivering significant speedup, and close-to-optimal performance for commonly used parallel computing workloads. We first quantify the extent of the ``Ninja gap'', which is the performance gap between naively written C/C++ code that is parallelism unaware (often serial) and best-optimized code on modern multi-/many-core processors. Using a set of representative throughput computing benchmarks, we show that there is an average Ninja gap of 24X (up to 53X) for a recent 6-core Intel\reg{} Core\texttrademark{} i7 X980 Westmere CPU, and that this gap if left unaddressed will inevitably increase. We show how a set of well-known algorithmic changes coupled with advancements in modern compiler technology can bring down the Ninja gap to an average of just 1.3X. These changes typically require low programming effort, as compared to the very high effort in producing Ninja code. We also discuss hardware support for programmability that can reduce the impact of these changes and even further increase programmer productivity. We show equally encouraging results for the upcoming Intel\reg{} Many Integrated Core architecture (Intel\reg{} MIC) which has more cores and wider SIMD.
We thus demonstrate that we can contain the otherwise uncontrolled growth of the Ninja gap and offer a more stable and predictable performance growth over future architectures, offering strong evidence that radical language changes are not required.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kambadur:2012:HCA, author = "Melanie Kambadur and Kui Tang and Martha A. Kim", title = "{Harmony}: collection and analysis of parallel block vectors", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "452--463", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337211", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Efficient execution of well-parallelized applications is central to performance in the multicore era. Program analysis tools support the hardware and software sides of this effort by exposing relevant features of multithreaded applications. This paper describes parallel block vectors, which uncover previously unseen characteristics of parallel programs. Parallel block vectors provide block execution profiles per concurrency phase (e.g., the block execution profile of all serial regions of a program). This information provides a direct and fine-grained mapping between an application's runtime parallel phases and the static code that makes up those phases. This paper also demonstrates how to collect parallel block vectors with minimal application perturbation using Harmony. Harmony is an instrumentation pass for the LLVM compiler that introduces just 16-21\% overhead on average across eight Parsec benchmarks. 
We apply parallel block vectors to uncover several novel insights about parallel applications with direct consequences for architectural design. First, that the serial and parallel phases of execution used in Amdahl's Law are often composed of many of the same basic blocks. Second, that program features, such as instruction mix, vary based on the degree of parallelism, with serial phases in particular displaying different instruction mixes from the program as a whole. Third, that dynamic execution frequencies do not necessarily correlate with a block's parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wentzlaff:2012:CFG, author = "David Wentzlaff and Christopher J. Jackson and Patrick Griffin and Anant Agarwal", title = "Configurable fine-grain protection for multicore processor virtualization", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "464--475", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337213", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Multicore architectures, with their abundant on-chip resources, are effectively collections of systems-on-a-chip. The protection system for these architectures must support multiple concurrently executing operating systems (OSes) with different needs, and manage and protect the hardware's novel communication mechanisms and hardware features. Traditional protection systems are insufficient; they protect supervisor from user code, but typically do not protect one system from another, and only support fixed assignment of resources to protection levels. 
In this paper, we propose an alternative to traditional protection systems which we call configurable fine-grain protection (CFP). CFP enables the dynamic assignment of in-core resources to protection levels. We investigate how CFP enables different system software stacks to utilize the same configurable protection hardware, and how differing OSes can execute at the same time on a multicore processor with CFP. As illustration, we describe an implementation of CFP in a commercial multicore, the TILE64 processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ahn:2012:RHA, author = "Jeongseob Ahn and Seongwook Jin and Jaehyuk Huh", title = "Revisiting hardware-assisted page walks for virtualized systems", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "476--487", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337214", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Recent improvements in architectural supports for virtualization have extended traditional hardware page walkers to traverse nested page tables. However, current two-dimensional (2D) page walkers have been designed under the assumption that the usage patterns of guest and nested page tables are similar. In this paper, we revisit the architectural supports for nested page table walks to incorporate the unique characteristics of memory management by hypervisors. Unlike page tables in native systems, nested page table sizes do not impose significant overheads on the overall memory usage. Based on this observation, we propose to use flat nested page tables to reduce unnecessary memory references for nested walks. 
A competing mechanism to HW 2D page walkers is shadow paging, which duplicates guest page tables but provides direct translations from guest virtual to system physical addresses. However, shadow paging has been suffering from the overheads of synchronization between guest and shadow page tables. The second mechanism we propose is a speculative shadow paging mechanism, called speculative inverted shadow paging, which is backed by non-speculative flat nested page tables. The speculative mechanism provides a direct translation with a single memory reference for common cases, and eliminates the page table synchronization overheads. We evaluate the proposed schemes with the real Xen hypervisor running on a full system simulator. The flat page tables improve a state-of-the-art 2D page walker with a page walk cache and nested TLB by 7\%. The speculative shadow paging improves the same 2D page walker by 14\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kontorinis:2012:MDU, author = "Vasileios Kontorinis and Liuyi Eric Zhang and Baris Aksanli and Jack Sampson and Houman Homayoun and Eddie Pettis and Dean M. Tullsen and Tajana Simunic Rosing", title = "Managing distributed {UPS} energy for effective power capping in data centers", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "488--499", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337216", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Power over-subscription can reduce costs for modern data centers. 
However, designing the power infrastructure for a lower operating power point than the aggregated peak power of all servers requires dynamic techniques to avoid high peak power costs and, even worse, tripping circuit breakers. This work presents an architecture for distributed per-server UPSs that stores energy during low activity periods and uses this energy during power spikes. This work leverages the distributed nature of the UPS batteries and develops policies that prolong the duration of their usage. The specific approach shaves 19.4\% of the peak power for modern servers, at no cost in performance, allowing the installation of 24\% more servers within the same power budget. More servers amortize infrastructure costs better and, hence, reduce total cost of ownership per server by 6.3\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lotfi-Kamran:2012:SP, author = "Pejman Lotfi-Kamran and Boris Grot and Michael Ferdman and Stavros Volos and Onur Kocberber and Javier Picorel and Almutaz Adileh and Djordje Jevdjic and Sachin Idgunji and Emre Ozer and Babak Falsafi", title = "Scale-out processors", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "500--511", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337217", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Scale-out datacenters mandate high per-server throughput to get the maximum benefit from the large TCO investment. Emerging applications (e.g., data serving and web search) that run in these datacenters operate on vast datasets that are not accommodated by on-die caches of existing server chips. 
Large caches reduce the die area available for cores and lower performance through long access latency when instructions are fetched. Performance on scale-out workloads is maximized through a modestly-sized last-level cache that captures the instruction footprint at the lowest possible access latency. In this work, we introduce a methodology for designing scalable and efficient scale-out server processors. Based on a metric of performance-density, we facilitate the design of optimal multi-core configurations, called pods. Each pod is a complete server that tightly couples a number of cores to a small last-level cache using a fast interconnect. Replicating the pod to fill the die area yields processors which have optimal performance density, leading to maximum per-chip throughput. Moreover, as each pod is a stand-alone server, scale-out processors avoid the expense of global (i.e., inter-pod) interconnect and coherence. These features synergistically maximize throughput, lower design complexity, and improve technology scalability. In 20nm technology, scale-out chips improve throughput by 5x-6.5x over conventional and by 1.6x-1.9x over emerging tiled organizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:2012:ICO, author = "Chao Li and Amer Qouneh and Tao Li", title = "{iSwitch}: coordinating and optimizing renewable energy powered server clusters", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "512--523", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337218", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Large-scale computing systems such as data centers are facing increasing pressure to cap their carbon footprint. 
Integrating emerging clean energy solutions into computer system design therefore gains great significance in the green computing era. While some pioneering work on tracking variable power budget show promising energy efficiency, they are not suitable for data centers due to lack of performance guarantee when renewable generation is low and fluctuant. In addition, our characterization of wind power behavior reveals that data centers designed to track the intermittent renewable power incur up to 4X performance loss due to inefficient and redundant load matching activities. As a result, mitigating operational overhead while still maintaining desired energy utilization becomes the most significant challenge in managing server clusters on intermittent renewable energy generation. In this paper we take a first step in digging into the operational overhead of renewable energy powered data center. We propose iSwitch, a lightweight server power management that follows renewable power variation characteristics, leverages existing system infrastructures, and applies supply/load cooperative scheme to mitigate the performance overhead. Comparing with state-of-the-art renewable energy driven system design, iSwitch could mitigate average network traffic by 75\%, peak network traffic by 95\%, and reduce 80\% job waiting time while still maintaining 96\% renewable energy utilization. 
We expect that our work can help computer architects make informed decisions on sustainable and high-performance system design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Singh:2012:EES, author = "Abhayendra Singh and Satish Narayanasamy and Daniel Marino and Todd Millstein and Madanlal Musuvathi", title = "End-to-end sequential consistency", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "524--535", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337220", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Sequential consistency (SC) is arguably the most intuitive behavior for a shared-memory multithreaded program. It is widely accepted that language-level SC could significantly improve programmability of a multiprocessor system. However, efficiently supporting end-to-end SC remains a challenge as it requires that both compiler and hardware optimizations preserve SC semantics. While a recent study has shown that a compiler can preserve SC semantics for a small performance cost, an efficient and complexity-effective SC hardware remains elusive. Past hardware solutions relied on aggressive speculation techniques, which has not yet been realized in a practical implementation. This paper exploits the observation that hardware need not enforce any memory model constraints on accesses to thread-local and shared read-only locations. A processor can easily determine a large fraction of these safe accesses with assistance from static compiler analysis and the hardware memory management unit. We discuss a low-complexity hardware design that exploits this information to reduce the overhead in ensuring SC. 
Our design employs an additional unordered store buffer for fast-tracking thread-local stores and allowing later memory accesses to proceed without a memory ordering related stall. Our experimental study shows that the cost of guaranteeing end-to-end SC is only 6.2\% on average when compared to a system with TSO hardware executing a stock compiler's output.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mars:2012:BDS, author = "Jason Mars and Naveen Kumar", title = "{BlockChop}: dynamic squash elimination for hybrid processor architecture", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "536--547", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337221", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Hybrid processors are HW/SW co-designed processors that leverage blocked-execution, the execution of regions of instructions as atomic blocks, to facilitate aggressive speculative optimization. As we move to a multicore hybrid design, fine grained conflicts for shared data can violate the atomicity requirement of these blocks and lead to expensive squashes and rollbacks. However, as these atomic regions differ from those used in checkpointing and transactional memory systems, the extent of this potentially prohibitive problem remains unclear, and mechanisms to mitigate these squashes dynamically may be critical to enable a highly performant multicore hybrid design. In this work, we investigate how multithreaded applications, both benchmark and commercial workloads, are affected by squashes, and present dynamic mechanisms for mitigating these squashes in hybrid processors.
While the current wisdom is that there is not a significant number of squashes for smaller atomic regions, we observe this is not the case for many multithreaded workloads. With region sizes of just 200--500 instructions, we observe a performance degradation ranging from 10\% to more than 50\% for workloads with a mixture of shared reads and writes. By harnessing the unique flexibility provided by the software subsystem of hybrid processor design, we present BlockChop, a framework for dynamically mitigating squashes on multicore hybrid processors. We present a range of squash handling mechanisms leveraging retrials, interpretation, and retranslation, and find that BlockChop is quite effective. Over the current response to exceptions and squashes in a hybrid design, we are able to improve the performance of benchmark and commercial workloads by 1.4x and 1.2x on average for large and small region sizes respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yoon:2012:DGM, author = "Doe Hyun Yoon and Min Kyu Jeong and Michael Sullivan and Mattan Erez", title = "The dynamic granularity memory system", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "548--559", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337222", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Chip multiprocessors enable continued performance scaling with increasingly many cores per chip. As the throughput of computation outpaces available memory bandwidth, however, the system bottleneck will shift to main memory. 
We present a memory system, the dynamic granularity memory system (DGMS), which avoids unnecessary data transfers, saves power, and improves system performance by dynamically changing between fine and coarse-grained memory accesses. DGMS predicts memory access granularities dynamically in hardware, and does not require software or OS support. The dynamic operation of DGMS gives it superior ease of implementation and power efficiency relative to prior multi-granularity memory systems, while maintaining comparable levels of system performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Aguilera:2012:AEW, author = "Marcos K. Aguilera and Dahlia Malkhi and Keith Marzullo and Alessandro Panconesi and Andrzej Pelc and Roger Wattenhofer", title = "Announcing the {2012 Edsger W. Dijkstra Prize in Distributed Computing}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "4", pages = "1--2", month = sep, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2411116.2411118", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 11 08:06:57 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Maitra:2012:NAC, author = "Subhashis Maitra and Amitabha Sinha", title = "A new algorithm for computing triple-base number system", journal = j-COMP-ARCH-NEWS, volume = "40", number = "4", pages = "3--9", month = sep, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2411116.2411119", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 11 08:06:57 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/fparith.bib; 
https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We introduce here a generalized method a new Algorithm to find Triple-Base number system and Triple-Base chain and hence in turn Single Digit Triple-Base number system (SDTBNS). The proposed method is not only simpler and faster than the Algorithms to find Double-Base number system or Double-Base chain, experimentally it also returns a shorter length of Triple-Base chain which in turn reduces the size of the look-up-table to find out SDTBNS. The complexity analysis and experimental results show the novelty of the proposed Algorithm. Moreover when the proposed method is applied to find scalar multiplication in case of Elliptic Curve Cryptography and coefficient multiplication in case of designing digital filter, its efficiency also proves its novelty. Here we have used third base as $5$ because when it is multiplied by $2$ gives $ 10$ which can be efficiently used for decimal shifting, i.e. if an integer '$n$' can be represented in SDTBNS form, then $ n / 10_x$ or $ n \times 10_x$ can also be represented in SDTBNS only by dividing or multiplying '$n$' by $ 10$.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kumar:2012:NLT, author = "Shiv Kumar and Seshadri Krishna Murthy and G. Varaprasad and S. Sivasathya", title = "Network load and traffic pattern on the capacity of wireless ad hoc networks", journal = j-COMP-ARCH-NEWS, volume = "40", number = "4", pages = "10--25", month = sep, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2411116.2411120", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 11 08:06:57 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper focuses on the capacity of wireless ad hoc networks and analyzes the effect of key factors viz.
network size, traffic patterns and detailed local radio interactions on the capacity of such networks. The capacity is evaluated with several different network layouts and traffic patterns through simulations. To demonstrate the impact of these factors, the capacity evaluation starts with a simple case of a chain of evenly spaced nodes in a network environment and progresses to a network with random traffic and randomly spaced nodes. Initially, capacity of static nodes is evaluated for various network layouts and traffic patterns. Since, in most scenarios, nodes do not travel significant distances during packet transmissions. As an enhancement, mobility of nodes is introduced into the network scenario and the performance is again evaluated. The simulations are carried out using OPNET modeler and the results obtained are presented in this report. The results are analyzed to understand the impact of these factors on the capacity and consequently suggest measures to increase the same. This work shows that the achievable capacity of ad hoc network depends on network size, traffic pattern and mobility. In a single cell topology, it is found that there is a 50\% reduction in network throughput, if the node size increases from 2 to 10 nodes, whereas there is a 74\% reduction in the throughput for chain topology for the same increase in node size. In a lattice topology with horizontal traffic, there is a 46 \% reduction in network throughput when the lattice size increases from $ 4 \times 4 $ to $ 5 \times 5 $. The same percentage of reduction is observed when both horizontal and network traffic is introduced. In a random network topology with random traffic, there is an 80 \% reduction in network throughput when the node size increases from 150 to 750 nodes. 
However, for the same scenario with the introduction of mobility to the nodes, a slight improvement is achieved with an overall 75\% reduction in network throughput.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Isa:2012:EAS, author = "M. N. Isa and K. Benkrid and T. Clayton", title = "Efficient architecture and scheduling technique for pairwise sequence alignment", journal = j-COMP-ARCH-NEWS, volume = "40", number = "4", pages = "26--31", month = sep, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2411116.2411121", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 11 08:06:57 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A novel efficient hardware architecture to optimize the execution time of dynamic programming-based (DP) pairwise sequence alignment algorithms in hardware is proposed. It is realized by introducing an efficient overlapped scheduling of alignment matrix computation and substitution coefficients' pre-loading onto processing elements (PEs) in folded systolic arrays. A new metric is also proposed as an independent performance evaluator to compare different core implementations on different FPGA platforms fairly. Implementation results show that the new hardware architecture for sequence alignment achieves a minimum of 40 percent area normalized speed-up compared to the state-of-the-art hardware implementation, with the speed-up growing linearly with the number of folds e.g. 120 percent speed up for 16-fold. Compared to equivalent software implementations, the novel hardware architecture achieves a minimum of $ 103 \times $ speed-up, with the speed-up growing linearly with the number of folds e.g.
$ 140 \times $ speed-up for 20-fold.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Oudjida:2012:NHR, author = "A. K. Oudjida and N. Chaillet and M. L. Berrandjia and A. Liacha", title = "A new high radix-{$ 2^r $} ({$ r \geq 8 $}) multibit recoding algorithm for large operand size ({$ N \geq 32$}) multipliers", journal = j-COMP-ARCH-NEWS, volume = "40", number = "4", pages = "32--43", month = sep, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2411116.2411122", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 11 08:06:57 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper addresses the problem of multiplication with large operand sizes ($ N \geq 32$). We propose a new recursive recoding algorithm that shortens the critical path of the multiplier and reduces the hardware complexity of partial-product-generators as well. The new recoding algorithm provides an optimal space/time partitioning of the multiplier architecture for any size $N$ of the operands. As a result, the critical path is drastically reduced to $ 3 \sqrt[3]{N / 2} - 3 $ with no area overhead in comparison to modified Booth algorithm that shows a critical path of $ N / 2$ in adder stages. For instance, only $7$ adder stages are needed for a 64-bit two's complement multiplier.
Confronted to reference algorithms for $ N = 64$, important gain ratios of $ 1.62$, $ 1.71$, $ 2.64$ are obtained in terms of multiply-time, energy consumption per multiply operation, and total gate count, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2012:INb, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "40", number = "4", pages = "44--48", month = sep, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2411116.2411124", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Dec 11 08:06:57 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This column consists of selected traffic from the {\tt comp.arch} newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network. As always, the opinions expressed in this column are the personal views of the authors, and do not necessarily represent the institutions to which they are affiliated. Text which sets the context of a message appears underlined or in italics; this is usually text the author has quoted from earlier messages. 
The code-like expressions below the authors' names are their addresses on Internet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Amano:2012:FBC, author = "Hideharu Amano and Wayne Luk", title = "{FPGA}-based {Connect6} solver with hardware-accelerated move refinement", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "4--9", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460218", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "Connect6 is a two-player game similar to Go-Moku, which was introduced in 2003. Since placing two stones in each turn makes a huge game-tree, we require some acceleration techniques for a solver based on a typical approach to search the tree. This paper presents an FPGA-based Connect6 solver with two-level move refinement. The solver has the dedicated hardware to accelerate the move refinement by exploiting various parallelism with a systolic array, linear arrays, and multiple score-calculation units. Implementation with a low-end FPGA demonstrates that the accelerator allows the two-level move refinement in the FPGA-based solver running at 90 MHz to be 103695 and 414 times faster than equivalent software implementation with NIOS II soft processor on the FPGA and Intel Core i7 processor operating at 2.93 GHz, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chau:2012:RRP, author = "Thomas C. P. Chau and Wayne Luk and Peter Y. K. 
Cheung", title = "{Roberts}: reconfigurable platform for benchmarking real-time systems", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "10--15", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460219", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "This paper presents Roberts, a Reconfigurable platfOrm for BEnchmarking Real-Time Systems. Roberts is the first platform which can be customised for a given system-under-test to support benchmarking of real-time properties and energy consumption. The benchmarking takes into account system workload and environmental events, with facilities for generating test vectors conforming to the specification of system under test, and with support for on-line monitoring of the response time, output values and energy consumption. The proposed benchmarking platform has been implemented in the DE4 development system to provide cycle-accurate timing measurement at nano-second precision to analyse high performance applications. 
An evaluation of our approach shows that the platform can be used in analysing the performance of target applications and overheads of other timing facilities, such as the interval timer on processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kinoshita:2012:ARS, author = "Kei Kinoshita and Daisuke Takano and Tomoyuki Okamura and Tetsuhiko Yao and Yoshiki Yamaguchi", title = "An augmented reality system with a coarse-grained reconfigurable device", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "16--21", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460220", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "Image recognition and motion tracking are widely utilized in the field of Augmented Reality (AR). Although their computational cost is huge, they enable to extend the practicality and the range of applications if all computation is processed within real time. Toward this goal, in this paper, we propose a handheld AR system optimized for direct hardware computation. It includes a subspace method for image recognition and a KLT tracking algorithm for motion tracking. The AR system is composed of one two-million-pixel-CCD-image sensor, one head-mounted display, one reconfigurable device called DAPDNA-2, and so on. DAPDNA-2 is a coarse-grained and dynamic-reconfigurable device which is produced by Tokyo Keiki Inc. The merit of DAPDNA-2 is its short-reconfiguration time and it is utilised to full for not only high performance but also the reduction of power consumption. 
The experimental result through a real Japanese-English translation system shows image recognition and motion tracking are computed within real-time; the computation time is less than 0.741 milliseconds per a VGA-resolution (640 x 480 pixels) frame. Thus, we are able to find a highly efficient computation using a coarse-grained architecture compared with general-purpose processors and embedded processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ng:2012:STT, author = "Nicholas Ng and Nobuko Yoshida and Xin Yu Niu and Kuen Hung Tsoi", title = "Session types: towards safe and fast reconfigurable programming", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "22--27", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460221", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "This paper introduces a new programming framework based on the theory of session types for safe, reconfigurable parallel designs. We apply the session type theory to C and Java programming languages and demonstrate that the session-based languages can offer a clear and tractable framework to describe communications between parallel components and guarantee communication-safety and deadlock-freedom by compile-time type checking. Many representative communication topologies such as a ring or scatter-gather can be programmed and verified in session-based programming languages. 
Case studies involving N-body simulation and Kmeans clustering are used to illustrate the session-based programming style and to demonstrate that the session-based languages perform competitively against MPI counterparts in an FPGA-based heterogeneous cluster, as well as the potential of integrating them with FPGA acceleration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Syed:2012:LOA, author = "Rizwan Syed and Yajun Ha and Bharadwaj Veeravalli", title = "A low overhead abstract architecture for {FPGA} resource management", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "28--33", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460222", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "To support dynamic applications, FPGAs will need to have a software operating system equivalent resource manager. An abstract FPGA architecture is the foundation to develop such an FPGA resource manager. Previous research projects work on the FPGA abstraction by abstracting the computing and/or the communication resources. However, various constraints made their proposals practically less useful due to the performance and/or the area overheads. We develop a low overhead abstract FPGA architecture that has the important features such as dynamically sized reconfigurable regions, deterministic communications among regions, clock network management and in-circuit debugging for regions. The architecture is demonstrated by implementing three applications on the Xilinx Virtex 5 FPGAs. We evaluate our work by comparing the area and performance overheads due to the abstractions between the abstracted and the non-abstracted applications. 
Experimental results show that additional resources required due to abstractions are found to be 6.4\% on average. This is achieved with low overheads on the timing performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tsoi:2012:MRS, author = "Kuen Hung Tsoi and Tobias Becker and Wayne Luk", title = "Modelling reconfigurable systems in event driven simulation", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "34--39", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460223", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "Reconfigurable platforms allow hardware developers to customise their designs for specific applications. However, their adoption involves challenges in understanding and estimating the impact of various design parameters and approaches. This paper proposes a unified framework to model behaviour of reconfigurable systems using an event driven simulation approach. This provides an abstract yet informative method to capture both analytical relationships and empirical parameters of reconfigurable systems. It can be used to help making design decisions or verifying analytical models. We apply this approach to three models of reconfigurable applications to estimate the communication efficiency of networked clusters, and the performance and energy efficiency of runtime reconfigurable designs for software-defined radio and for option pricing in finance. 
The results show that, through this simulation framework, we can verify the accuracy of analytical models and also obtain practical information that is not provided by analytical models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shun:2012:FAC, author = "Zheng Zhi Shun and Tsutomu Maruyama", title = "{FPGA} acceleration of {CDO} pricing based on correlation expansions", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "40--45", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460224", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "Because of the significant growth in the financial market, faster and accurate pricing of widespread instruments is becoming more important. In this paper, we describe an FPGA implementation of an analytical method for collateralized debt obligation (CDO) pricing in the multifactor Normal Copula model. 
Our experiments show that the FPGA system is about 40 times faster than corresponding software on a single core 3 GHz Intel Core2 processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nakahara:2012:WFF, author = "Hiroki Nakahara and Hiroyuki Nakanishi and Tsutomu Sasao", title = "On a wideband {Fast Fourier Transform} for a radio telescope", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "46--51", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460225", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "The radio telescope analyzes a radio frequency from celestial objects by using fast Fourier transform (FFT). In this application, its bandwidth f is wider than that of the typical FFT. Since the amount of hardware for the typical FFT circuit is proportional to the bandwidth f, a special technique is necessary for this application. This paper shows a realization of wideband FFT for the radio telescope on an FPGA. We show that the memory size for the conventional FFT, which consists of the twiddle factor memory and the transpose memory, is too large. We replace the twiddle factor memory with the pipelined CORDIC. To reduce the number of transpose memories, we increase the radix of the FFT from $ 2^2 $ to $ 2^k $, also we use the DDR2SDRAM to implement the transpose memory. We implement the $ 2^{30} $-FFT on an Altera's Stratix IV GX530 FPGA. It performs the $ 2^{30} $-FFT operations in 1.5 seconds. Compared with the Altera's FFT library, our FFT circuit realizes $ 2^{14} $ times wider bandwidth on the same FPGA.
Also, compared with Tesla S1070 utilizing four GPUs, our FFT circuit is faster and dissipates lower power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ling:2012:HPP, author = "Cheng Ling and Khaled Benkrid and Tsuyoshi Hamada", title = "High performance phylogenetic analysis on {CUDA}-compatible {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "52--57", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460226", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "The operation of phylogenetic analysis aims to investigate the evolution and relationships among species. It is widely used in the fields of system biology and comparative genomics. However, phylogenetic analysis is also a computationally intensive operation as the number of tree topology grows in a factorial way with the number of species involved. Therefore, due to the large number of species in the real world, the computational burden has largely thwarted phylogenetic reconstruction. In this paper, we describe the detailed GPU-based multi-threaded design and implementation of a Markov Chain Monte Carlo (MCMC) maximum likelihood algorithm for phylogenetic analysis on a set of aligned nucleotide sequences. The implementation is based on the framework of the most widely used phylogenetic analysis tool, namely MrBayes. 
The proposed approach resulted in 6x-8x speed-up on an NVidia Geforce 460 GTX GPU compared to an optimized GPP-based software implementation running on a desktop computer with a single Intel Xeon 2.53 GHz CPU and 6.0 GB RAM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lin:2012:EED, author = "Colin Yu Lin and Hayden Kwok-Hay So", title = "Energy-efficient dataflow computations on {FPGAs} using application-specific coarse-grain architecture synthesis", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "58--63", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460227", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "Compiling high-level user applications to execute on FPGA-based reconfigurable computers often involve synthesizing dataflow graphs beyond the capacity of the available hardware resources. A framework that provides rapid and energy-efficient compilation of such dataflow graphs on FPGAs using an array of pre-placed configurable processing elements is proposed. The mapping schedule of the compute operations on the CPEs and the direct network among the CPEs are co-synthesized on a per-application basis to provide the targeted power-performance tradeoff. Compared to the use of a fixed generic topology, the use of an application-specific topology derived by a genetic algorithm can achieve up to 28\% improvement in energy-delay product. As the CPEs are pre-placed, compiling for a new application involve only the generation of a new operation schedule, which is stored in on-chip memory, and the new routes among the CPEs.
With optimization in operation scheduling and mapping and application-specific interconnect network, the proposed framework achieved up to 199X better energy-delay product compared to a traditional FPGA high-level synthesis tool xPilot. The use of such framework is anticipated to serve as part of a high-level application compiler for hybrid CPU-FPGA computation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Malik:2012:ERA, author = "Jamshaid Sarwar Malik and Paolo Palazzari and Ahmed Hemani", title = "Effort, resources, and abstraction vs performance in high-level synthesis: finding new answers to an old question", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "64--69", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460228", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "This work provides new perspectives on impact of design effort, consumed resources and design abstraction on hardware performance in a high-level synthesis flow. We have shown that counter to published literature as well as intuition; more design effort may not always result in better performance. We developed a kernel that simulates Brownian motion, and investigated improvement in hardware performance with design effort at various abstraction levels. Our results indicate that a designer should be careful in putting more effort at a particular abstraction level. In our case, we achieved best performance/effort ratio at algorithm level rather than lower abstraction levels. 
This strongly suggests that design effort is not always proportional to corresponding improvement in performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kakimoto:2012:PCG, author = "Takeshi Kakimoto and Keisuke Dohi and Yuichiro Shibata and Kiyoshi Oguri", title = "Performance comparison of {GPU} programming frameworks with the striped {Smith--Waterman} algorithm", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "70--75", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460229", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "This paper evaluates and discusses how different GPU programming frameworks affect the performance obtained from GPU acceleration of the striped smith-waterman algorithm used for biological sequence alignment. A total of 6 GPU implementations of the algorithm on NVIDIA GT200b and AMD RV870 using the CUDA and the OpenCL frameworks are compared to analyze cons and pros of explicit descriptions for architecture specific hardware mechanisms in the code. The evaluation results show that the primitive descriptions with the CUDA are still efficient especially for small size data, while better instruction scheduling and optimizations are carried out by the OpenCL compiler. On the other hand, the combination of OpenCL and RV870 which provides a relatively simple view of the architecture is efficient for the large data size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tribino:2012:PPA, author = "Julien Tribino and Antoine Trouv{\'e} and Hadrien A. Clarke and Kazuaki J. 
Murakami", title = "{PASTIS}: a photonic arbitration with scalable token injection scheme", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "76--81", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460230", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "This paper introduces PASTIS, a novel photonic arbitration protocol based on a scalable token injection scheme, and ring-based nanophotonic technology. It aims at connecting together processors and memories in many-core computer systems by means of a ring topology. The main strength of PASTIS lays in the fact that it uses photonic components exclusively, that is, routing does not require any electronics. In this work, we compare it with an hybrid opto-electronic protocol as presented in a related work. Simulations show that PASTIS performs better in terms of bandwidth, latency and energy consumption. Indeed, it is scalable as it can adapt its bandwidth to the system's workload, thereby saving energy. Finally, we also study the opportunity of using reconfigurable rings. 
We determine that they almost halve the overall static power consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Watanabe:2012:MCP, author = "Takahiro Watanabe and Minoru Watanabe", title = "$ 0.18 \mu $ m {CMOS} process high-sensitivity optically reconfigurable gate array {VLSI}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "82--86", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460231", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "Currently, demand for high-speed dynamic reconfiguration of a programmable device is increasing for the purpose of increasing the performance of such devices. To support the high speed dynamic reconfiguration, optically reconfigurable gate arrays (ORGAs) have been developed up to now. An ORGA consists of a holographic memory, a laser array, and an optically reconfigurable gate array VLSI. The holographic memory can store many configuration contexts. In addition, its large bandwidth optical connection enables high speed reconfiguration. However, photodiode sensitivities of conventional ORGAs were not good. 
This paper therefore presents a newly fabricated $ 0.18 \mu $ m CMOS process optically reconfigurable gate array VLSI chip with highly sensitive photocircuits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nakaya:2012:NVR, author = "Shogo Nakaya and Makoto Miyamura and Noboru Sakimura and Yuichi Nakamura and Tadahiko Sugibayashi", title = "A non-volatile reconfigurable offloader for wireless sensor nodes", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "87--92", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460232", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "Energy saving is currently one of the most important issues in the development of battery-powered wireless sensor nodes (WSNs). We have developed a non-volatile reconfigurable offloader for flexible and highly efficient processing on WSNs that uses NanoBridges (NBs), which are novel non-volatile and reprogrammable switching elements. Non-volatility is essential for the intermittent operation of WSNs due to the requirement of power-on without loading configuration data. We implemented a data compression algorithm on the offloader that reduces energy consumption during data transmission.
Simulation results showed that the energy consumption on the offloader was $ 11 / 21 $ of that on an ultra-low power CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2012:INc, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "93--112", month = dec, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460234", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "HEART '12 conference proceedings.", abstract = "This column consists of selected traffic from the comp.arch newsgroup, a forum for discussion of computer architecture on the Internet---an international computer network. As always, the opinions expressed in this column are the personal views of the authors, and do not necessarily represent the institutions to which they are affiliated. Text which sets the context of a message appears underlined or in italics; this is usually text the author has quoted from earlier messages. The code-like expressions below the authors' names are their addresses on Internet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Bond:2013:GDG, author = "Hadi Jooybar and Wilson W. L. Fung and Mike O'Connor and Joseph Devietti and Tor M. Aamodt", title = "{GPUDet}: a deterministic {GPU} architecture", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "1--12", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451118", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Nondeterminism is a key challenge in developing multithreaded applications.
Even with the same input, each execution of a multithreaded program may produce a different output. This behavior complicates debugging and limits one's ability to test for correctness. This non-reproducibility situation is aggravated on massively parallel architectures like graphics processing units (GPUs) with thousands of concurrent threads. We believe providing a deterministic environment to ease debugging and testing of GPU applications is essential to enable a broader class of software to use GPUs. Many hardware and software techniques have been proposed for providing determinism on general-purpose multi-core processors. However, these techniques are designed for small numbers of threads. Scaling them to thousands of threads on a GPU is a major challenge. This paper proposes a scalable hardware mechanism, GPUDet, to provide determinism in GPU architectures. In this paper we characterize the existing deterministic and nondeterministic aspects of current GPU execution models, and we use these observations to inform GPUDet's design. For example, GPUDet leverages the inherent determinism of the SIMD hardware in GPUs to provide determinism within a wavefront at no cost. GPUDet also exploits the Z-Buffer Unit, an existing GPU hardware unit for graphics rendering, to allow parallel out-of-order memory writes to produce a deterministic output. Other optimizations in GPUDet include deterministic parallel execution of atomic operations and a workgroup-aware algorithm that eliminates unnecessary global synchronizations. Our simulation results indicate that GPUDet incurs only 2X slowdown on average over a baseline nondeterministic architecture, with runtime overheads as low as 4\% for compute-bound applications, despite running GPU kernels with thousands of threads. 
We also characterize the sources of overhead for deterministic execution on GPUs to provide insights for further optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Sung:2013:DEH, author = "Hyojin Sung and Rakesh Komuravelli and Sarita V. Adve", title = "{DeNovoND}: efficient hardware support for disciplined non-determinism", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "13--26", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451119", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent work has shown that disciplined shared-memory programming models that provide deterministic-by-default semantics can simplify both parallel software and hardware. Specifically, the DeNovo hardware system has shown that the software guarantees of such models (e.g., data-race-freedom and explicit side-effects) can enable simpler, higher performance, and more energy-efficient hardware than the current state-of-the-art for deterministic programs. Many applications, however, contain non-deterministic parts; e.g., using lock synchronization. For commercial hardware to exploit the benefits of DeNovo, it is therefore necessary to extend DeNovo to support non-deterministic applications. This paper proposes DeNovoND, a system that supports lock-based, disciplined non-determinism, with the simplicity, performance, and energy benefits of DeNovo. We use a combination of distributed queue-based locks and access signatures to implement simple memory consistency semantics for safe non-determinism, with a coherence protocol that does not require transient states, invalidation traffic, or directories, and does not incur false sharing. 
The resulting system is simpler, shows comparable or better execution time, and has 33\% less network traffic on average (translating directly into energy savings) relative to a state-of-the-art invalidation-based protocol for 8 applications designed for lock synchronization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Wester:2013:PDR, author = "Benjamin Wester and David Devecsery and Peter M. Chen and Jason Flinn and Satish Narayanasamy", title = "Parallelizing data race detection", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "27--38", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451120", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Detecting data races in multithreaded programs is a crucial part of debugging such programs, but traditional data race detectors are too slow to use routinely. This paper shows how to speed up race detection by spreading the work across multiple cores. Our strategy relies on uniparallelism, which executes time intervals of a program (called epochs) in parallel to provide scalability, but executes all threads from a single epoch on a single core to eliminate locking overhead. We use several techniques to make parallelization effective: dividing race detection into three phases, predicting a subset of the analysis state, eliminating sequential work via transitive reduction, and reducing the work needed to maintain multiple versions of analysis via factorization. We demonstrate our strategy by parallelizing a happens-before detector and a lockset-based detector. We find that uniparallelism can significantly speed up data race detection.
With 4x the number of cores as the original application, our strategy speeds up the median execution time by 4.4x for a happens-before detector and 3.3x for a lockset race detector. Even on the same number of cores as the conventional detectors, the ability for uniparallelism to elide analysis locks allows it to reduce the median overhead by 13\% for a happens-before detector and 8\% for a lockset detector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Lucia:2013:CEF, author = "Brandon Lucia and Luis Ceze", title = "Cooperative empirical failure avoidance for multithreaded programs", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "39--50", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451121", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Concurrency errors in multithreaded programs are difficult to find and fix. We propose Aviso, a system for avoiding schedule-dependent failures. Aviso monitors events during a program's execution and, when a failure occurs, records a history of events from the failing execution. It uses this history to generate schedule constraints that perturb the order of events in the execution and thereby avoids schedules that lead to failures in future program executions. Aviso leverages scenarios where many instances of the same software run, using a statistical model of program behavior and experimentation to determine which constraints most effectively avoid failures. 
After implementing Aviso, we showed that it decreased failure rates for a variety of important desktop, server, and cloud applications by orders of magnitude, with an average overhead of less than 20\% and, in some cases, as low as 5\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Goiri:2013:PGM, author = "{\'I}{\~n}igo Goiri and William Katsak and Kien Le and Thu D. Nguyen and Ricardo Bianchini", title = "{Parasol} and {GreenSwitch}: managing datacenters powered by renewable energy", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "51--64", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451123", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Several companies have recently announced plans to build ``green'' datacenters, i.e. datacenters partially or completely powered by renewable energy. These datacenters will either generate their own renewable energy or draw it directly from an existing nearby plant. Besides reducing carbon footprints, renewable energy can potentially reduce energy costs, reduce peak power costs, or both. However, certain renewable fuels are intermittent, which requires approaches for tackling the energy supply variability. One approach is to use batteries and/or the electrical grid as a backup for the renewable energy. It may also be possible to adapt the workload to match the renewable energy supply. For highest benefits, green datacenter operators must intelligently manage their workloads and the sources of energy at their disposal. In this paper, we first discuss the tradeoffs involved in building green datacenters today and in the future. 
Second, we present Parasol, a prototype green datacenter that we have built as a research platform. Parasol comprises a small container, a set of solar panels, a battery bank, and a grid-tie. Third, we describe GreenSwitch, our model-based approach for dynamically scheduling the workload and selecting the source of energy to use. Our real experiments with Parasol, GreenSwitch, and MapReduce workloads demonstrate that intelligent workload and energy source management can produce significant cost reductions. Our results also isolate the cost implications of peak power management, storing energy on the grid, and the ability to delay the MapReduce jobs. Finally, our results demonstrate that careful workload and energy source management can minimize the negative impact of electrical grid outages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Shen:2013:PCF, author = "Kai Shen and Arrvindh Shriraman and Sandhya Dwarkadas and Xiao Zhang and Zhuan Chen", title = "Power containers: an {OS} facility for fine-grained power and energy management on multicore servers", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "65--76", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451124", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Energy efficiency and power capping are critical concerns in server and cloud computing systems. They face growing challenges due to dynamic power variations from new client-directed web applications, as well as complex behaviors due to multicore resource sharing and hardware heterogeneity. 
This paper presents a new operating system facility called ``power containers'' that accounts for and controls the power and energy usage of individual fine-grained requests in multicore servers. This facility relies on three key techniques --- (1) online model that attributes multicore power (including shared maintenance power) to concurrently running tasks, (2) alignment of actual power measurements and model estimates to enable online model recalibration, and (3) on-the-fly application-transparent request tracking in multi-stage servers to isolate the power and energy contributions and customize per-request control. Our mechanisms enable new multicore server management capabilities including fair power capping that only penalizes power-hungry requests, and energy-aware request distribution between heterogeneous servers. Our evaluation uses three multicore processors (Intel Woodcrest, Westmere, and SandyBridge) and a variety of server and cloud computing (Google App Engine) workloads. Our results demonstrate the high accuracy of our request power accounting (no more than 11\% errors) and the effectiveness of container-enabled power virus isolation and throttling. 
Our request distribution case study shows up to 25\% energy saving compared to an alternative approach that recognizes machine heterogeneity but not fine-grained workload affinity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Delimitrou:2013:PQA, author = "Christina Delimitrou and Christos Kozyrakis", title = "{Paragon}: {QoS}-aware scheduling for heterogeneous datacenters", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "77--88", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451125", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Large-scale datacenters (DCs) host tens of thousands of diverse applications each day. However, interference between colocated workloads and the difficulty to match applications to one of the many hardware platforms available can degrade performance, violating the quality of service (QoS) guarantees that many cloud workloads require. While previous work has identified the impact of heterogeneity and interference, existing solutions are computationally intensive, cannot be applied online and do not scale beyond few applications. We present Paragon, an online and scalable DC scheduler that is heterogeneity and interference-aware. Paragon is derived from robust analytical methods and instead of profiling each application in detail, it leverages information the system already has about applications it has previously seen. It uses collaborative filtering techniques to quickly and accurately classify an unknown, incoming workload with respect to heterogeneity and interference in multiple shared resources, by identifying similarities to previously scheduled applications. 
The classification allows Paragon to greedily schedule applications in a manner that minimizes interference and maximizes server utilization. Paragon scales to tens of thousands of servers with marginal scheduling overheads in terms of time or state. We evaluate Paragon with a wide range of workload scenarios, on both small and large-scale systems, including 1,000 servers on EC2. For a 2,500-workload scenario, Paragon enforces performance guarantees for 91\% of applications, while significantly improving utilization. In comparison, heterogeneity-oblivious, interference-oblivious and least-loaded schedulers only provide similar guarantees for 14\%, 11\% and 3\% of workloads. The differences are more striking in oversubscribed scenarios where resource efficiency is more critical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Tang:2013:RRS, author = "Lingjia Tang and Jason Mars and Wei Wang and Tanima Dey and Mary Lou Soffa", title = "{ReQoS}: reactive static\slash dynamic compilation for {QoS} in warehouse scale computers", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "89--100", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451126", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As multicore processors with expanding core counts continue to dominate the server market, the overall utilization of the class of datacenters known as warehouse scale computers (WSCs) depends heavily on colocation of multiple workloads on each server to take advantage of the computational power provided by modern processors. 
However, many of the applications running in WSCs, such as websearch, are user-facing and have quality of service (QoS) requirements. When multiple applications are co-located on a multicore machine, contention for shared memory resources threatens application QoS as severe cross-core performance interference may occur. WSC operators are left with two options: either disregard QoS to maximize WSC utilization, or disallow the co-location of high-priority user-facing applications with other applications, resulting in low machine utilization and millions of dollars wasted. This paper presents ReQoS, a static/dynamic compilation approach that enables low-priority applications to adaptively manipulate their own contentiousness to ensure the QoS of high-priority co-runners. ReQoS is composed of a profile guided compilation technique that identifies and inserts markers in contentious code regions in low-priority applications, and a lightweight runtime that monitors the QoS of high-priority applications and reactively reduces the pressure low-priority applications generate to the memory subsystem when cross-core interference is detected. In this work, we show that ReQoS can accurately diagnose contention and significantly reduce performance interference to ensure application QoS. Applying ReQoS to SPEC2006 and SmashBench workloads on real multicore machines, we are able to improve machine utilization by more than 70\% in many cases, and more than 50\% on average, while enforcing a 90\% QoS threshold. 
We are also able to improve the energy efficiency of modern multicore machines by 47\% on average over a policy of disallowing co-locations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Arulraj:2013:PRS, author = "Joy Arulraj and Po-Chun Chang and Guoliang Jin and Shan Lu", title = "Production-run software failure diagnosis via hardware performance counters", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "101--112", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451128", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Sequential and concurrency bugs are widespread in deployed software. They cause severe failures and huge financial loss during production runs. Tools that diagnose production-run failures with low overhead are needed. The state-of-the-art diagnosis techniques use software instrumentation to sample program properties at run time and use off-line statistical analysis to identify properties most correlated with failures. Although promising, these techniques suffer from high run-time overhead, which is sometimes over 100\%, for concurrency-bug failure diagnosis and hence are not suitable for production-run usage. We present PBI, a system that uses existing hardware performance counters to diagnose production-run failures caused by sequential and concurrency bugs with low overhead. PBI is designed based on several key observations. First, a few widely supported performance counter events can reflect a wide variety of common software bugs and can be monitored by hardware with almost no overhead. 
Second, the counter overflow interrupt supported by existing hardware and operating systems provides a natural and effective mechanism to conduct event sampling at user level. Third, the noise and non-determinism in interrupt delivery complements well with statistical processing. We evaluate PBI using 13 real-world concurrency and sequential bugs from representative open-source server, client, and utility programs, and 10 bugs from a widely used software-testing benchmark. Quantitatively, PBI can effectively diagnose failures caused by these bugs with a small overhead that is never higher than 10\%. Qualitatively, PBI does not require any change to software and presents a novel use of existing hardware performance counters.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Zhang:2013:CFC, author = "Wei Zhang and Marc de Kruijf and Ang Li and Shan Lu and Karthikeyan Sankaralingam", title = "{ConAir}: featherweight concurrency bug recovery via single-threaded idempotent execution", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "113--126", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451129", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many concurrency bugs are hidden in deployed software and cause severe failures for end-users. When they finally manifest and become known by developers, they are difficult to fix correctly. To support end-users, we need techniques that help software survive hidden concurrency bugs during production runs. To help developers, we need techniques that fix exposed concurrency bugs. 
The state-of-the-art techniques on concurrency-bug fixing and survival only satisfy a subset of four important properties: compatibility, correctness, generality, and performance. We aim to develop a system that satisfies all of these four properties. To achieve this goal, we leverage two observations: (1) rolling back a single thread is sufficient to recover from most concurrency-bug failures; (2) reexecuting an idempotent region, which requires no memory-state checkpoint, is sufficient to recover from many concurrency-bug failures. Our system ConAir includes a static analysis component that automatically identifies potential failure sites, a static analysis component that automatically identifies the idempotent code regions around every failure site, and a code-transformation component that inserts rollback-recovery code around the identified idempotent regions. We evaluated ConAir on 10 real-world concurrency bugs in widely used C/C++ open-source applications. These bugs cover different types of failure symptoms and root causes. Quantitatively, ConAir helps software survive failures caused by all of these bugs with negligible run-time overhead ({$<$1}\%) and short recovery time. Qualitatively, ConAir can help recover from failures caused by unknown bugs.
It guarantees that program semantics remain unchanged and requires no change to operating systems or hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Viennot:2013:TMR, author = "Nicolas Viennot and Siddharth Nair and Jason Nieh", title = "Transparent mutable replay for multicore debugging and patch validation", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "127--138", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451130", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present Dora, a mutable record-replay system which allows a recorded execution of an application to be replayed with a modified version of the application. This feature, not available in previous record-replay systems, enables powerful new functionality. In particular, Dora can help reproduce, diagnose, and fix software bugs by replaying a version of a recorded application that is recompiled with debugging information, reconfigured to produce verbose log output, modified to include additional print statements, or patched to fix a bug. Dora uses lightweight operating system mechanisms to record an application execution by capturing nondeterministic events to a log without imposing unnecessary timing and ordering constraints. It replays the log using a modified version of the application even in the presence of added, deleted, or modified operations that do not match events in the log. Dora searches for a replay that minimizes differences between the log and the replayed execution of the modified program. If there are no modifications, Dora provides deterministic replay of the unmodified program. 
We have implemented a Linux prototype which provides transparent mutable replay without recompiling or relinking applications. We show that Dora is useful for reproducing, diagnosing, and fixing software bugs in real-world applications, including Apache and MySQL. Our results show that Dora (1) captures bugs and replays them with applications modified or reconfigured to produce additional debugging output for root cause diagnosis, (2) captures exploits and replays them with patched applications to validate that the patches successfully eliminate vulnerabilities, (3) records production workloads and replays them with patched applications to validate patches with realistic workloads, and (4) maintains low recording overhead on commodity multicore hardware, making it suitable for production systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Sahoo:2013:ULI, author = "Swarup Kumar Sahoo and John Criswell and Chase Geigle and Vikram Adve", title = "Using likely invariants for automated software fault localization", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "139--152", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451131", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose an automatic diagnosis technique for isolating the root cause(s) of software failures. We use likely program invariants, automatically generated using correct inputs that are close to the fault-triggering input, to select a set of candidate program locations which are possible root causes. 
We then trim the set of candidate root causes using software-implemented dynamic backwards slicing, plus two new filtering heuristics: dependence filtering, and filtering via multiple failing inputs that are also close to the failing input. Experimental results on reported software bugs of three large open-source servers show that we are able to narrow down the number of candidate bug locations to between 5 and 17 program expressions, even in programs that are hundreds of thousands of lines long.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Paulos:2013:REA, author = "Eric Paulos", title = "The rise of the expert amateur: {DIY} culture and the evolution of computer science", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "153--154", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451133", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We are at an important technological inflection point. Most of our computing systems have been designed and built by professionally trained experts (i.e. us --- computer scientists, engineers, and designers) for use in specific domains and to solve explicit problems. Artifacts often called ``user manuals'' traditionally prescribed the appropriate usage of these tools and implied an acceptable etiquette for interaction and experience. 
A fringe group of individuals usually labeled ``hackers'' or ``amateurs'' or ``makers'' have challenged this producer-consumer model of technology by creating novel hardware and software features to ``improve'' our research and products while a similar creative group of technicians called ``artists'' have redirected the techniques, tools, and tenets of accepted technological usage away from their typical manifestations in practicality and product. Over time the technological artifacts of these fringe groups and the support for their rhetoric have gained them a foothold into computing culture and eroded the established power discontinuities within the practice of computing research. We now expect our computing tools to be driven by an architecture of open participation and democracy that encourages users to add value to their tools and applications as they use them. Similarly, the bar for enabling the design of novel, personal computing systems and ``hardware remixes'' has fallen to the point where many non-experts and novices are readily embracing and creating fascinating and ingenious computing artifacts outside of our official and traditionally sanctioned academic and industrial research communities. But how have we as ``expert'' practitioners been influencing this discussion? By constructing a practice around the design and development of technology for task based and problem solving applications, we have unintentionally established such work as the status quo for the human computing experience. We have failed in our duty to open up alternate forums for technology to express itself and touch our lives beyond productivity and efficiency. Blinded by our quest for ``smart technologies'' we have forgotten to contemplate the design of technologies to inspire us to be smarter, more curious, and more inquisitive. We owe it to ourselves to rethink the impact we desire to have on this historic moment in computing culture. 
We must choose to participate in and perhaps lead a dialogue that heralds an expansive new acceptable practice of designing to enable participation by experts and non-experts alike. We are in the milieu of the rise of the ``expert amateur''. We must change our mantra --- not just performance, completeness, and usability but openness, usefulness and relevancy to our world, its citizens, and our environment. This talk will explore elements of the DIY and maker culture and its relevancy to research questions across computational hardware, languages, and systems. Ultimately, this talk will outline and argue for expanding the design territory and potential opportunities for all of us to collaborate and benefit as a society from this cultural movement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Raghavan:2013:CSH, author = "Arun Raghavan and Laurel Emurian and Lei Shao and Marios Papaefthymiou and Kevin P. Pipe and Thomas F. Wenisch and Milo M. K. 
Martin", title = "Computational sprinting on a hardware\slash software testbed", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "155--166", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451135", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "CMOS scaling trends have led to an inflection point where thermal constraints (especially in mobile devices that employ only passive cooling) preclude sustained operation of all transistors on a chip --- a phenomenon called ``dark silicon.'' Recent research proposed computational sprinting --- exceeding sustainable thermal limits for short intervals --- to improve responsiveness in light of the bursty computation demands of many media-rich interactive mobile applications. Computational sprinting improves responsiveness by activating reserve cores (parallel sprinting) and/or boosting frequency/voltage (frequency sprinting) to power levels that far exceed the system's sustainable cooling capabilities, relying on thermal capacitance to buffer heat. Prior work analyzed the feasibility of sprinting through modeling and simulation. In this work, we investigate sprinting using a hardware/software testbed. First, we study unabridged sprints, wherein the computation completes before temperature becomes critical, demonstrating a 6.3x responsiveness gain, and a 6\% energy efficiency improvement by racing to idle. We then analyze truncated sprints, wherein our software runtime system must intervene to prevent overheating by throttling parallelism and frequency before the computation is complete. To avoid oversubscription penalties (context switching inefficiencies after a truncated parallel sprint), we develop a sprint-aware task-based parallel runtime. 
We find that maximal-intensity sprinting is not always best, introduce the concept of sprint pacing, and evaluate an adaptive policy for selecting sprint intensity. We report initial results using a phase change heat sink to extend maximum sprint duration. Finally, we demonstrate that a sprint-and-rest operating regime can actually outperform thermally-limited sustained execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Ahn:2013:DAS, author = "Wonsun Ahn and Yuelu Duan and Josep Torrellas", title = "{DeAliaser}: alias speculation using atomic region support", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "167--180", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451136", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Alias analysis is a critical component in many compiler optimizations. A promising approach to reduce the complexity of alias analysis is to use speculation. The approach consists of performing optimizations assuming the alias relationships that are true most of the time, and repairing the code when such relationships are found not to hold through runtime checks. This paper proposes a general alias speculation scheme that leverages upcoming hardware support for transactions with the help of some ISA extensions. The ability of transactions to checkpoint and roll back frees the compiler to pursue aggressive optimizations without having to worry about recovery code. Also, exposing the memory conflict detection hardware in transactions to software allows runtime checking of aliases with little or no overhead. 
We test the potential of the novel alias speculation approach with Loop Invariant Code Motion (LICM), Global Value Numbering (GVN), and Partial Redundancy Elimination (PRE) optimization passes. On average, they are shown to reduce program execution time by 9\% in SPEC FP2006 applications and 3\% in SPEC INT2006 applications over the alias analysis of a state-of-the-art compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Park:2013:RCH, author = "Heekwon Park and Seungjae Baek and Jongmoo Choi and Donghee Lee and Sam H. Noh", title = "Regularities considered harmful: forcing randomness to memory accesses to reduce row buffer conflicts for multi-core, multi-bank systems", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "181--192", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451137", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose a novel kernel-level memory allocator, called M$^3$ (M-cube, Multi-core Multi-bank Memory allocator), that has the following two features. First, it introduces and makes use of a notion of a memory container, which is defined as a unit of memory that comprises the minimum number of page frames that can cover all the banks of the memory organization, by exclusively assigning a container to a core so that each core achieves bank parallelism as much as possible. Second, it orchestrates page frame allocation so that pages that threads access are dispersed randomly across multiple banks so that each thread's access pattern is randomized. The development of M$^3$ is based on a tool that we develop to fully understand the architectural characteristics of the underlying memory organization. 
Using an extension of this tool, we observe that the same application that accesses pages in a random manner outperforms one that accesses pages in a regular pattern such as sequential or same ordered accesses. This is because such randomized accesses reduces inter-thread access interference on the row-buffer in memory banks. We implement M$^3$ in the Linux kernel version 2.6.32 on the Intel Xeon system that has 16 cores and 32GB DRAM. Performance evaluation with various workloads show that M$^3$ improves the overall performance for memory intensive benchmarks by up to 85\% with an average of about 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Honarmand:2013:CUA, author = "Nima Honarmand and Nathan Dautenhahn and Josep Torrellas and Samuel T. King and Gilles Pokam and Cristiano Pereira", title = "{Cyrus}: unintrusive application-level record-replay for replay parallelism", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "193--206", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451138", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Architectures for deterministic record-replay (R\&R) of multithreaded code are attractive for program debugging, intrusion analysis, and fault-tolerance uses. However, very few of the proposed designs have focused on maximizing replay speed --- a key enabling property of these systems. The few efforts that focus on replay speed require intrusive hardware or software modifications, or target whole-system R\&R rather than the more useful application-level R\&R. This paper presents the first hardware-based scheme for unintrusive, application-level R\&R that explicitly targets high replay speed. 
Our scheme, called Cyrus, requires no modification to commodity snoopy cache coherence. It introduces the concept of an on-the-fly software Backend Pass during recording which, as the log is being generated, transforms it for high replay parallelism. This pass also fixes-up the log, and can flexibly trade-off replay parallelism for log size. We analyze the performance of Cyrus using full system (OS plus hardware) simulation. Our results show that Cyrus has negligible recording overhead. In addition, for 8-processor runs of SPLASH-2, Cyrus attains an average replay parallelism of 5, and a replay speed that is, on average, only about 50\% lower than the recording speed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{deOliveira:2013:WYS, author = "Augusto Born de Oliveira and Sebastian Fischmeister and Amer Diwan and Matthias Hauswirth and Peter F. Sweeney", title = "Why you should care about quantile regression", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "207--218", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451140", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Research has shown that correctly conducting and analysing computer performance experiments is difficult. This paper investigates what is necessary to conduct successful computer performance evaluation by attempting to repeat a prior experiment: the comparison between two Linux schedulers. In our efforts, we found that exploring an experimental space through a series of incremental experiments can be inconclusive, and there may be no indication of how much experimentation will be enough. 
Analysis of variance (ANOVA), a traditional analysis method, is able to partly solve the problems with the previous approach, but we demonstrate that ANOVA can be insufficient for proper analysis due to the requirements it imposes on the data. Finally, we demonstrate the successful application of quantile regression, a recent development in statistics, to computer performance experiments. Quantile regression can provide more insight into the experiment than ANOVA, with the additional benefit of being applicable to data from any distribution. This property makes it especially useful in our field, since non-normally distributed data is common in computer experiments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Curtsinger:2013:SSS, author = "Charlie Curtsinger and Emery D. Berger", title = "{STABILIZER}: statistically sound performance evaluation", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "219--228", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451141", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Researchers and software developers require effective performance evaluation. Researchers must evaluate optimizations or measure overhead. Software developers use automatic performance regression tests to discover when changes improve or degrade performance. The standard methodology is to compare execution times before and after applying changes. Unfortunately, modern architectural features make this approach unsound. Statistically sound evaluation requires multiple samples to test whether one can or cannot (with high confidence) reject the null hypothesis that results are the same before and after. 
However, caches and branch predictors make performance dependent on machine-specific parameters and the exact layout of code, stack frames, and heap objects. A single binary constitutes just one sample from the space of program layouts, regardless of the number of runs. Since compiler optimizations and code changes also alter layout, it is currently impossible to distinguish the impact of an optimization from that of its layout effects. This paper presents Stabilizer, a system that enables the use of the powerful statistical techniques required for sound performance evaluation on modern architectures. Stabilizer forces executions to sample the space of memory configurations by repeatedly re-randomizing layouts of code, stack, and heap objects at runtime. Stabilizer thus makes it possible to control for layout effects. Re-randomization also ensures that layout effects follow a Gaussian distribution, enabling the use of statistical tests like ANOVA. We demonstrate Stabilizer's efficiency ({$<$7}\% median overhead) and its effectiveness by evaluating the impact of LLVM's optimizations on the SPEC CPU2006 benchmark suite. 
We find that, while -O2 has a significant impact relative to -O1, the performance impact of -O3 over -O2 optimizations is indistinguishable from random noise.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Gidra:2013:SSS, author = "Lokesh Gidra and Ga{\"e}l Thomas and Julien Sopena and Marc Shapiro", title = "A study of the scalability of stop-the-world garbage collectors on multicores", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "229--240", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451142", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Large-scale multicore architectures create new challenges for garbage collectors (GCs). In particular, throughput-oriented stop-the-world algorithms demonstrate good performance with a small number of cores, but have been shown to degrade badly beyond approximately 8 cores on a 48-core with OpenJDK 7. This negative result raises the question whether the stop-the-world design has intrinsic limitations that would require a radically different approach. Our study suggests that the answer is no, and that there is no compelling scalability reason to discard the existing highly-optimised throughput-oriented GC code on contemporary hardware. This paper studies the default throughput-oriented garbage collector of OpenJDK 7, called Parallel Scavenge. We identify its bottlenecks, and show how to eliminate them using well-established parallel programming techniques. 
On the SPECjbb2005, SPECjvm2008 and DaCapo 9.12 benchmarks, the improved GC matches the performance of Parallel Scavenge at low core count, but scales well, up to 48~cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{McFarlin:2013:DDO, author = "Daniel S. McFarlin and Charles Tucker and Craig Zilles", title = "Discerning the dominant out-of-order performance advantage: is it speculation or dynamism?", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "241--252", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451143", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we set out to study the performance advantages of an Out-of-Order (OOO) processor relative to in-order processors with similar execution resources. In particular, we try to tease apart the performance contributions from two sources: the improved schedules enabled by OOO hardware speculation support and its ability to generate different schedules on different occurrences of the same instructions based on operand and functional unit availability. We find that the ability to express good static schedules achieves the bulk of the speedup resulting from OOO. Specifically, of the 53\% speedup achieved by OOO relative to a similarly provisioned in-order machine, we find that 88\% of that speedup can be achieved by using a single ``best'' static schedule as suggested by observing an OOO schedule of the code. We discuss the ISA mechanisms that would be required to express these static schedules. 
Furthermore, we find that the benefits of dynamism largely come from two kinds of events that influence the application's critical path: load instructions that miss in the cache only part of the time and branch mispredictions. We find that much of the benefit of OOO dynamism can be achieved by the potentially simpler task of addressing these two behaviors directly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Checkoway:2013:IAW, author = "Stephen Checkoway and Hovav Shacham", title = "{Iago} attacks: why the system call {API} is a bad untrusted {RPC} interface", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "253--264", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451145", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In recent years, researchers have proposed systems for running trusted code on an untrusted operating system. Protection mechanisms deployed by such systems keep a malicious kernel from directly manipulating a trusted application's state. Under such systems, the application and kernel are, conceptually, peers, and the system call API defines an RPC interface between them. We introduce Iago attacks, attacks that a malicious kernel can mount in this model. We show how a carefully chosen sequence of integer return values to Linux system calls can lead a supposedly protected process to act against its interests, and even to undertake arbitrary computation at the malicious kernel's behest. 
Iago attacks are evidence that protecting applications from malicious kernels is more difficult than previously realized.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Hofmann:2013:ISA, author = "Owen S. Hofmann and Sangman Kim and Alan M. Dunn and Michael Z. Lee and Emmett Witchel", title = "{InkTag}: secure applications on an untrusted operating system", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "265--278", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451146", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "InkTag is a virtualization-based architecture that gives strong safety guarantees to high-assurance processes even in the presence of a malicious operating system. InkTag advances the state of the art in untrusted operating systems in both the design of its hypervisor and in the ability to run useful applications without trusting the operating system. We introduce paraverification, a technique that simplifies the InkTag hypervisor by forcing the untrusted operating system to participate in its own verification. Attribute-based access control allows trusted applications to create decentralized access control policies. InkTag is also the first system of its kind to ensure consistency between secure data and metadata, ensuring recoverability in the face of system crashes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Giuffrida:2013:SAL, author = "Cristiano Giuffrida and Anton Kuijsten and Andrew S. 
Tanenbaum", title = "Safe and automatic live update for operating systems", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "279--292", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451147", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Increasingly many systems have to run all the time with no downtime allowed. Consider, for example, systems controlling electric power plants and e-banking servers. Nevertheless, security patches and a constant stream of new operating system versions need to be deployed without stopping running programs. These factors naturally lead to a pressing demand for live update---upgrading all or parts of the operating system without rebooting. Unfortunately, existing solutions require significant manual intervention and thus work reliably only for small operating system patches. In this paper, we describe an automated system for live update that can safely and automatically handle major upgrades without rebooting. We have implemented our ideas in Proteos, a new research OS designed with live update in mind. Proteos relies on system support and nonintrusive instrumentation to handle even very complex updates with minimal manual effort. The key novelty is the idea of state quiescence, which allows updates to happen only in safe and predictable system states. A second novelty is the ability to automatically perform transactional live updates at the process level, ensuring a safe and stable update process. Unlike prior solutions, Proteos supports automated state transfer, state checking, and hot rollback. We have evaluated Proteos on 50 real updates and on novel live update scenarios. 
The results show that our techniques can effectively support both simple and complex updates, while outperforming prior solutions in terms of flexibility, security, reliability, and stability of the update process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Mai:2013:VSI, author = "Haohui Mai and Edgar Pek and Hui Xue and Samuel Talmadge King and Parthasarathy Madhusudan", title = "Verifying security invariants in {ExpressOS}", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "293--304", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451148", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Security for applications running on mobile devices is important. In this paper we present ExpressOS, a new OS for enabling high-assurance applications to run on commodity mobile devices securely. Our main contributions are a new OS architecture and our use of formal methods for proving key security invariants about our implementation. In our use of formal methods, we focus solely on proving that our OS implements our security invariants correctly, rather than striving for full functional correctness, requiring significantly less verification effort while still proving the security relevant aspects of our system. We built ExpressOS, analyzed its security, and tested its performance. Our evaluation shows that the performance of ExpressOS is comparable to an Android-based system. 
In one test, we ran the same web browser on ExpressOS and on an Android-based system, and found that ExpressOS adds 16\% overhead on average to the page load latency time for nine popular web sites.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Schkufza:2013:SS, author = "Eric Schkufza and Rahul Sharma and Alex Aiken", title = "Stochastic superoptimization", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "305--316", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451150", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We formulate the loop-free binary superoptimization task as a stochastic search problem. The competing constraints of transformation correctness and performance improvement are encoded as terms in a cost function, and a Markov Chain Monte Carlo sampler is used to rapidly explore the space of all possible programs to find one that is an optimization of a given target program. Although our method sacrifices completeness, the scope of programs we are able to consider, and the resulting quality of the programs that we produce, far exceed those of existing superoptimizers. 
Beginning from binaries compiled by llvm -O0 for 64-bit x86, our prototype implementation, STOKE, is able to produce programs which either match or outperform the code produced by gcc -O3, icc -O3, and in some cases, expert handwritten assembly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Schulte:2013:ARB, author = "Eric Schulte and Jonathan DiLorenzo and Westley Weimer and Stephanie Forrest", title = "Automated repair of binary and assembly programs for cooperating embedded devices", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "317--328", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451151", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present a method for automatically repairing arbitrary software defects in embedded systems, which have limited memory, disk and CPU capacities, but exist in great numbers. We extend evolutionary computation (EC) algorithms that search for valid repairs at the source code level to assembly and ELF format binaries, compensating for limited system resources with several algorithmic innovations. Our method does not require access to the source code or build toolchain of the software under repair, does not require program instrumentation, specialized execution environments, or virtual machines, or prior knowledge of the bug type. We repair defects in ARM and x86 assembly as well as ELF binaries, observing decreases of 86\% in memory and 95\% in disk requirements, with 62\% decrease in repair time, compared to similar source-level techniques. These advances allow repairs previously possible only with C source code to be applied to any ARM or x86 assembly or ELF executable. 
Efficiency gains are achieved by introducing stochastic fault localization, with much lower overhead than comparable deterministic methods, and low-level program representations. When distributed over multiple devices, our algorithm finds repairs faster than predicted by naive parallelism. Four devices using our approach are five times more efficient than a single device because of our collaboration model. The algorithm is implemented on Nokia N900 smartphones, with inter-phone communication fitting in 900 bytes sent in 7 SMS text messages per device per repair on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Cui:2013:VSR, author = "Heming Cui and Gang Hu and Jingyue Wu and Junfeng Yang", title = "Verifying systems rules using rule-directed symbolic execution", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "329--342", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451152", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Systems code must obey many rules, such as ``opened files must be closed.'' One approach to verifying rules is static analysis, but this technique cannot infer precise runtime effects of code, often emitting many false positives. An alternative is symbolic execution, a technique that verifies program paths over all inputs up to a bounded size. However, when applied to verify rules, existing symbolic execution systems often blindly explore many redundant program paths while missing relevant ones that may contain bugs. Our key insight is that only a small portion of paths are relevant to rules, and the rest (majority) of paths are irrelevant and do not need to be verified. 
Based on this insight, we create WOODPECKER, a new symbolic execution system for effectively checking rules on systems programs. It provides a set of builtin checkers for common rules, and an interface for users to easily check new rules. It directs symbolic execution toward the program paths relevant to a checked rule, and soundly prunes redundant paths, exponentially speeding up symbolic execution. It is designed to be heuristic-agnostic, enabling users to leverage existing powerful search heuristics. Evaluation on 136 systems programs totaling 545K lines of code, including some of the most widely used programs, shows that, with a time limit of typically just one hour for each verification run, WOODPECKER effectively verifies 28.7\% of the program and rule combinations over bounded input, whereas an existing symbolic execution system KLEE verifies only 8.5\%. For the remaining combinations, WOODPECKER verifies 4.6 times as many relevant paths as KLEE. With a longer time limit, WOODPECKER verifies much more paths than KLEE, e.g., 17 times as many with a four-hour limit. 
WOODPECKER detects 113 rule violations, including 10 serious data loss errors with 2 most serious ones already confirmed by the corresponding developers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Xiang:2013:HHO, author = "Xiaoya Xiang and Chen Ding and Hao Luo and Bin Bao", title = "{HOTL}: a higher order theory of locality", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "343--356", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451153", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The locality metrics are many, for example, miss ratio to test performance, data footprint to manage cache sharing, and reuse distance to analyze and optimize a program. It is unclear how different metrics are related, whether one subsumes another, and what combination may represent locality completely. This paper first derives a set of formulas to convert between five locality metrics and gives the condition for correctness. The transformation is analogous to differentiation and integration used to convert between higher order polynomials. As a result, these metrics can be assigned an order and organized into a hierarchy. Using the new theory, the paper then develops two techniques: one measures the locality in real time without special hardware support, and the other predicts multicore cache interference without parallel testing. 
The paper evaluates them using sequential and parallel programs as well as for a parallel mix of sequential programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Kang:2013:HPP, author = "Hui Kang and Jennifer L. Wong", title = "To hardware prefetch or not to prefetch?: a virtualized environment study and core binding approach", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "357--368", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451155", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Most hardware and software vendors suggest disabling hardware prefetching in virtualized environments. They claim that prefetching is detrimental to application performance due to inaccurate prediction caused by workload diversity and VM interference on shared cache. However, no comprehensive or quantitative measurements to support this belief have been performed. This paper is the first to systematically measure the influence of hardware prefetching in virtualized environments. We examine a wide variety of benchmarks on three types of chip-multiprocessors (CMPs) to analyze the hardware prefetching performance. We conduct extensive experiments by taking into account a number of important virtualization factors. We find that hardware prefetching has minimal destructive influence under most configurations. Only with certain application combinations does prefetching influence the overall performance. To leverage these findings and make hardware prefetching effective across a diversity of virtualized environments, we propose a dynamic prefetching-aware VCPU-core binding approach (PAVCB), which includes two phases --- classifying and binding. 
The workload of each VM is classified into different cache sharing constraint categories based upon its cache access characteristics, considering both prefetch requests and demand requests. Then following heuristic rules, the VCPUs of each VM are scheduled onto appropriate cores subject to cache sharing constraints. We show that the proposed approach can improve performance by 12\% on average over the default scheduler and 46\% over manual system administrator bindings across different workload combinations in the presence of hardware prefetching.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Kim:2013:DBC, author = "Hwanju Kim and Sangwook Kim and Jinkyu Jeong and Joonwon Lee and Seungryoul Maeng", title = "Demand-based coordinated scheduling for {SMP VMs}", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "369--380", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451156", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As processor architectures have been enhancing their computing capacity by increasing core counts, independent workloads can be consolidated on a single node for the sake of high resource efficiency in data centers. With the prevalence of virtualization technology, each individual workload can be hosted on a virtual machine for strong isolation between co-located workloads. Along with this trend, hosted applications have increasingly been multithreaded to take advantage of improved hardware parallelism. 
Although the performance of many multithreaded applications highly depends on communication (or synchronization) latency, existing schemes of virtual machine scheduling do not explicitly coordinate virtual CPUs based on their communication behaviors. This paper presents a demand-based coordinated scheduling scheme for consolidated virtual machines that host multithreaded workloads. To this end, we propose communication-driven scheduling that controls time-sharing in response to inter-processor interrupts (IPIs) between virtual CPUs. On the basis of in-depth analysis on the relationship between IPI communications and coordination demands, we devise IPI-driven coscheduling and delayed preemption schemes, which effectively reduce synchronization latency and unnecessary CPU consumption. In addition, we introduce a load-conscious CPU allocation policy in order to address load imbalance in heterogeneously consolidated environments. The proposed schemes are evaluated with respect to various scenarios of mixed workloads using the PARSEC multithreaded applications. 
In the evaluation, our scheme improves the overall performance of consolidated workloads, especially communication-intensive applications, by reducing inefficient synchronization latency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Dashti:2013:TMH, author = "Mohammad Dashti and Alexandra Fedorova and Justin Funston and Fabien Gaud and Renaud Lachaize and Baptiste Lepers and Vivien Quema and Mark Roth", title = "Traffic management: a holistic approach to memory placement on {NUMA} systems", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "381--394", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451157", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "NUMA systems are characterized by Non-Uniform Memory Access times, where accessing data in a remote node takes longer than a local access. NUMA hardware has been built since the late 80's, and the operating systems designed for it were optimized for access locality. They co-located memory pages with the threads that accessed them, so as to avoid the cost of remote accesses. Contrary to older systems, modern NUMA hardware has much smaller remote wire delays, and so remote access costs per se are not the main concern for performance, as we discovered in this work. Instead, congestion on memory controllers and interconnects, caused by memory traffic from data-intensive applications, hurts performance a lot more. Because of that, memory placement algorithms must be redesigned to target traffic congestion. This requires an arsenal of techniques that go beyond optimizing locality. In this paper we describe Carrefour, an algorithm that addresses this goal. 
We implemented Carrefour in Linux and obtained performance improvements of up to 3.6$\times$ relative to the default kernel, as well as significant improvements compared to NUMA-aware patchsets available for Linux. Carrefour never hurts performance by more than 4\% when memory placement cannot be improved. We present the design of Carrefour, the challenges of implementing it on modern hardware, and draw insights about hardware support that would help optimize system software on future NUMA systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Jog:2013:OCT, author = "Adwait Jog and Onur Kayiran and Nachiappan Chidambaram Nachiappan and Asit K. Mishra and Mahmut T. Kandemir and Onur Mutlu and Ravishankar Iyer and Chita R. Das", title = "{OWL}: cooperative thread array aware scheduling techniques for improving {GPGPU} performance", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "395--406", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451158", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging GPGPU architectures, along with programming models like CUDA and OpenCL, offer a cost-effective platform for many applications by providing high thread level parallelism at lower energy budgets. Unfortunately, for many general-purpose applications, available hardware resources of a GPGPU are not efficiently utilized, leading to lost opportunity in improving performance. A major cause of this is the inefficiency of current warp scheduling policies in tolerating long memory latencies. 
In this paper, we identify that the scheduling decisions made by such policies are agnostic to thread-block, or cooperative thread array (CTA), behavior, and as a result inefficient. We present a coordinated CTA-aware scheduling policy that utilizes four schemes to minimize the impact of long memory latencies. The first two schemes, CTA-aware two-level warp scheduling and locality aware warp scheduling, enhance per-core performance by effectively reducing cache contention and improving latency hiding capability. The third scheme, bank-level parallelism aware warp scheduling, improves overall GPGPU performance by enhancing DRAM bank-level parallelism. The fourth scheme employs opportunistic memory-side prefetching to further enhance performance by taking advantage of open DRAM rows. Evaluations on a 28-core GPGPU platform with highly memory-intensive applications indicate that our proposed mechanism can provide 33\% average performance improvement compared to the commonly-employed round-robin warp scheduling policy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Pai:2013:IGC, author = "Sreepathi Pai and Matthew J. Thazhuthaveetil and R. Govindarajan", title = "Improving {GPGPU} concurrency with elastic kernels", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "407--418", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451160", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Each new generation of GPUs vastly increases the resources available to GPGPU programs. GPU programming models (like CUDA) were designed to scale to use these resources. 
However, we find that CUDA programs actually do not scale to utilize all available resources, with over 30\% of resources going unused on average for programs of the Parboil2 suite that we used in our work. Current GPUs therefore allow concurrent execution of kernels to improve utilization. In this work, we study concurrent execution of GPU kernels using multiprogram workloads on current NVIDIA Fermi GPUs. On two-program workloads from the Parboil2 benchmark suite we find concurrent execution is often no better than serialized execution. We identify that the lack of control over resource allocation to kernels is a major serialization bottleneck. We propose transformations that convert CUDA kernels into elastic kernels which permit fine-grained control over their resource usage. We then propose several elastic-kernel aware concurrency policies that offer significantly better performance and concurrency compared to the current CUDA policy. We evaluate our proposals on real hardware using multiprogrammed workloads constructed from benchmarks in the Parboil 2 suite. On average, our proposals increase system throughput (STP) by 1.21x and improve the average normalized turnaround time (ANTT) by 3.73x for two-program workloads when compared to the current CUDA concurrency implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Oh:2013:PAL, author = "Taewook Oh and Hanjun Kim and Nick P. Johnson and Jae W. Lee and David I. 
August", title = "Practical automatic loop specialization", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "419--430", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451161", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Program specialization optimizes a program with respect to program invariants, including known, fixed inputs. These invariants can be used to enable optimizations that are otherwise unsound. In many applications, a program input induces predictable patterns of values across loop iterations, yet existing specializers cannot fully capitalize on this opportunity. To address this limitation, we present Invariant-induced Pattern based Loop Specialization (IPLS), the first fully-automatic specialization technique designed for everyday use on real applications. Using dynamic information-flow tracking, IPLS profiles the values of instructions that depend solely on invariants and recognizes repeating patterns across multiple iterations of hot loops. IPLS then specializes these loops, using those patterns to predict values across a large window of loop iterations. This enables aggressive optimization of the loop; conceptually, this optimization reconstructs recurring patterns induced by the input as concrete loops in the specialized binary. IPLS specializes real-world programs that prior techniques fail to specialize without requiring hints from the user. 
Experiments demonstrate a geomean speedup of 14.1\% with a maximum speedup of 138\% over the original codes when evaluated on three script interpreters and eleven scripts each.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Phothilimthana:2013:PPH, author = "Phitchaya Mangpo Phothilimthana and Jason Ansel and Jonathan Ragan-Kelley and Saman Amarasinghe", title = "Portable performance on heterogeneous architectures", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "431--444", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451162", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Trends in both consumer and high performance computing are bringing not only more cores, but also increased heterogeneity among the computational resources within a single machine. In many machines, one of the greatest computational resources is now their graphics coprocessors (GPUs), not just their primary CPUs. But GPU programming and memory models differ dramatically from conventional CPUs, and the relative performance characteristics of the different processors vary widely between machines. Different processors within a system often perform best with different algorithms and memory usage patterns, and achieving the best overall performance may require mapping portions of programs across all types of resources in the machine. To address the problem of efficiently programming machines with increasingly heterogeneous computational resources, we propose a programming model in which the best mapping of programs to processors and memories is determined empirically. 
Programs define choices in how their individual algorithms may work, and the compiler generates further choices in how they can map to CPU and GPU processors and memory systems. These choices are given to an empirical autotuning framework that allows the space of possible implementations to be searched at installation time. The rich choice space allows the autotuner to construct poly-algorithms that combine many different algorithmic techniques, using both the CPU and the GPU, to obtain better performance than any one technique alone. Experimental results show that algorithmic changes, and the varied use of both CPUs and GPUs, are necessary to obtain up to a 16.5x speedup over using a single program configuration for all architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Mittal:2013:EVE, author = "Aashish Mittal and Dushyant Bansal and Sorav Bansal and Varun Sethi", title = "Efficient virtualization on embedded {Power Architecture\reg} platforms", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "445--458", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451163", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Power Architecture\reg{} processors are popular and widespread on embedded systems, and such platforms are increasingly being used to run virtual machines. While the Power Architecture meets the Popek-and-Goldberg virtualization requirements for traditional trap-and-emulate style virtualization, the performance overhead of virtualization remains high. For example, workloads exhibiting a large amount of kernel activity typically show 3-5x slowdowns over bare-metal. 
Recent additions to the Linux kernel contain guest and host side paravirtual extensions for Power Architecture platforms. While these extensions improve performance significantly, they are guest-specific, guest-intrusive, and cover only a subset of all possible virtualization optimizations. We present a set of host-side optimizations that achieve comparable performance to the aforementioned paravirtual extensions, on an unmodified guest. Our optimizations are based on adaptive in-place binary translation. Unlike the paravirtual approach, our solution is guest neutral. We implement our ideas in a prototype based on Qemu/KVM. After our modifications, KVM can boot an unmodified Linux guest around 2.5x faster. We contrast our optimization approach with previous similar binary translation based approaches for the x86 architecture; in our experience, each architecture presents a unique set of challenges and optimization opportunities.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Hill:2013:RDC, author = "Mark D. Hill", title = "Research directions for 21st century computer systems: {ASPLOS 2013} panel", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "459--460", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451165", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Four recent efforts call out architectural challenges and opportunities up and down the software/hardware stack. This panel will discuss, ``What should the community do to facilitate, transcend, or refute these partially overlapping visions?'' The panel is chaired by Mark D. 
Hill with other panel members not finalized for the ASPLOS'13 proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Madhavapeddy:2013:ULO, author = "Anil Madhavapeddy and Richard Mortier and Charalampos Rotsos and David Scott and Balraj Singh and Thomas Gazagnaire and Steven Smith and Steven Hand and Jon Crowcroft", title = "Unikernels: library operating systems for the cloud", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "461--472", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451167", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present unikernels, a new approach to deploying cloud services via applications written in high-level source code. Unikernels are single-purpose appliances that are compile-time specialised into standalone kernels, and sealed against modification when deployed to a cloud platform. In return they offer significant reduction in image sizes, improved efficiency and security, and should reduce operational costs. Our Mirage prototype compiles OCaml code into unikernels that run on commodity clouds and offer an order of magnitude reduction in code size without significant performance penalty. The architecture combines static type-safety with a single address-space layout that can be made immutable via a hypervisor extension. 
Mirage contributes a suite of type-safe protocol libraries, and our results demonstrate that the hypervisor is a platform that overcomes the hardware compatibility issues that have made past library operating systems impractical to deploy in the real-world.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Kadav:2013:FGF, author = "Asim Kadav and Matthew J. Renzelmann and Michael M. Swift", title = "Fine-grained fault tolerance using device checkpoints", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "473--484", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451168", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recovering faults in drivers is difficult compared to other code because their state is spread across both memory and a device. Existing driver fault-tolerance mechanisms either restart the driver and discard its state, which can break applications, or require an extensive logging mechanism to replay requests and recreate driver state. Even logging may be insufficient, though, if the semantics of requests are ambiguous. In addition, these systems either require large subsystems that must be kept up-to-date as the kernel changes, or require substantial rewriting of drivers. We present a new driver fault-tolerance mechanism that provides fine-grained control over the code protected. Fine-Grained Fault Tolerance (FGFT) isolates driver code at the granularity of a single entry point. It executes driver code as a transaction, allowing roll back if the driver fails. We develop a novel checkpointing mechanism to save and restore device state using existing power management code. 
Unlike past systems, FGFT can be incrementally deployed in a single driver without the need for a large kernel subsystem, but at the cost of small modifications to the driver. In the evaluation, we show that FGFT can have almost zero runtime cost in many cases, and that checkpoint-based recovery can reduce the duration of a failure by 79\% compared to restarting the driver. Finally, we show that applying FGFT to a driver requires little effort, and the majority of drivers in common classes already contain the power-management code needed for checkpoint/restore.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Silberstein:2013:GIF, author = "Mark Silberstein and Bryan Ford and Idit Keidar and Emmett Witchel", title = "{GPUfs}: integrating a file system with {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "485--498", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451169", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "GPU hardware is becoming increasingly general purpose, quickly outgrowing the traditional but constrained GPU-as-coprocessor programming model. To make GPUs easier to program and easier to integrate with existing systems, we propose making the host's file system directly accessible from GPU code. GPUfs provides a POSIX-like API for GPU programs, exploits GPU parallelism for efficiency, and optimizes GPU file access by extending the buffer cache into GPU memory. Our experiments, based on a set of real benchmarks adopted to use our file system, demonstrate the feasibility and benefits of our approach. 
For example, we demonstrate a simple self-contained GPU program which searches for a set of strings in the entire tree of Linux kernel source files over seven times faster than an eight-core CPU run.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Hunt:2013:DTN, author = "Nicholas Hunt and Tom Bergan and Luis Ceze and Steven D. Gribble", title = "{DDOS}: taming nondeterminism in distributed systems", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "499--508", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451170", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Nondeterminism complicates the development and management of distributed systems, and arises from two main sources: the local behavior of each individual node as well as the behavior of the network connecting them. Taming nondeterminism effectively requires dealing with both sources. This paper proposes DDOS, a system that leverages prior work on deterministic multithreading to offer: (1) space-efficient record/replay of distributed systems; and (2) fully deterministic distributed behavior. Leveraging deterministic behavior at each node makes outgoing messages strictly a function of explicit inputs. This allows us to record the system by logging just message's arrival time, not the contents. Going further, we propose and implement an algorithm that makes all communication between nodes deterministic by scheduling communication onto a global logical timeline. 
We implement both algorithms in a system called DDOS and evaluate our system with parallel scientific applications, an HTTP/memcached system and a distributed microbenchmark with a high volume of peer-to-peer communication. Our results show up to two orders of magnitude reduction in log size of record/replay, and that distributed systems can be made deterministic with an order of magnitude of overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Wang:2013:TEH, author = "Cheng Wang and Youfeng Wu", title = "{TSO\_ATOMICITY}: efficient hardware primitive for {TSO}-preserving region optimizations", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "509--520", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451172", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Program optimizations based on data dependences may not preserve the memory consistency in the programs. Previous works leverage a hardware ATOMICITY primitive to restrict the thread interleaving for preserving sequential consistency in region optimizations. However, ATOMICITY primitive is over restrictive on the thread interleaving for optimizing real-world applications developed with the popular Total-Store-Ordering (TSO) memory consistency, which is weaker than sequential consistency. In this paper, we present a novel hardware TSO\_ATOMICITY primitive, which has less restriction on the thread interleaving than ATOMICITY primitive to permit more efficient program execution than ATOMICITY primitive, but can still preserve TSO memory consistency in all region optimizations. 
Furthermore, TSO\_ATOMICITY primitive requires similar architecture support as ATOMICITY primitive and can be implemented with only slight change to the existing ATOMICITY primitive implementation. Our experimental results show that in a start-of-art dynamic binary optimization system on a large set of workloads, ATOMICITY primitive can only improve the performance by 4\% on average. TSO\_ATOMICITY primitive can reduce the overhead associated with ATOMICITY primitive and improve the performance by 12\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '13 conference proceedings.", } @Article{Jafri:2013:WGI, author = "Syed Ali Raza Jafri and Gwendolyn Voskuilen and T. N. Vijaykumar", title = "{Wait-n-GoTM}: improving {HTM} performance by serializing cyclic dependencies", journal = j-COMP-ARCH-NEWS, volume = "41", number = "1", pages = "521--534", month = mar, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2490301.2451173", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:40:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Transactional memory (TM) has been proposed to alleviate some key programmability problems in chip multiprocessors. Most TMs optimistically allow concurrent transactions, detecting read-write or write-write conflicts. Upon conflicts, existing hardware TMs (HTMs) use one of three conflict-resolution policies: (1) always-abort, (2) always-wait for some conflicting transactions to complete, or (3) always-go past conflicts and resolve acyclic conflicts at commit or abort upon cyclic dependencies. While each policy has advantages, the policies degrade performance under contention by limiting concurrency (always-abort, always-wait) or incurring late aborts due to cyclic dependencies (always-go). 
Thus, while always-go avoids acyclic aborts, no policy avoids cyclic aborts. We propose Wait-n-GoTM (WnGTM) to increase concurrency while avoiding cyclic aborts. We observe that most cyclic dependencies are caused by threads interleaving multiple accesses to a few heavily-read-write-shared delinquent data cache blocks. These accesses occur in code sections called cycle inducer sections (CISTs). Accordingly, we propose Wait-n-Go (WnG) conflict-resolution to avoid many cyclic aborts by predicting and serializing the CISTs. To support the WnG policy, we extend previous HTMs to (1) allow multiple readers and writers, (2) scalably identify dependencies, and (3) detect cyclic dependencies via new mechanisms, namely, conflict transactional state, order-capture, and hardware timestamps, respectively. In 16-core simulations of STAMP, WnGTM achieves average speedups of 46\% for higher-contention benchmarks and 28\% for all benchmarks over always-abort (TokenTM) with low-contention benchmarks remaining unchanged, compared to always-go (DATM) and always-wait (LogTM-SE), which perform worse than and 6\% better than TokenTM, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Qian:2013:VSP,
  author =       "Xuehai Qian and Josep Torrellas and Benjamin Sahelices and Depei Qian",
  title =        "Volition: scalable and precise sequential consistency violation detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "535--548",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451174",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Sequential Consistency (SC) is the most intuitive memory model, and SC Violations (SCVs) produce unintuitive, typically
incorrect executions. Most prior SCV detection schemes have used data races as proxies for SCVs, which is highly imprecise. Other schemes that have targeted data-race cycles are either too conservative or are designed only for two-processor cycles and snoopy-based systems. This paper presents Volition, the first hardware scheme that detects SCVs in a relaxed-consistency machine precisely, in a scalable manner, and for an arbitrary number of processors in the cycle. Volition leverages cache coherence protocol transactions to dynamically detect cycles in memory-access orders across threads. When a cycle is about to occur, an exception is triggered. Volition can be used in both directory- and snoopy-based coherence protocols. Our simulations of Volition in a 64-processor multicore with directory-based coherence running SPLASH-2 and Parsec programs shows that Volition induces negligible traffic and execution overhead. In addition, it can detect SCVs with several processors. Volition is suitable for on-the-fly use.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Grossman:2013:HSF,
  author =       "J. P. Grossman and Jeffrey S. Kuskin and Joseph A. Bank and Michael Theobald and Ron O. Dror and Douglas J. Ierardi and Richard H. Larson and U. Ben Schafer and Brian Towles and Cliff Young and David E.
Shaw",
  title =        "Hardware support for fine-grained event-driven computation in {Anton 2}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "549--560",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451175",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Exploiting parallelism to accelerate a computation typically involves dividing it into many small tasks that can be assigned to different processing elements. An efficient execution schedule for these tasks can be difficult or impossible to determine in advance, however, if there is uncertainty as to when each task's input data will be available. Ideally, each task would run in direct response to the arrival of its input data, thus allowing the computation to proceed in a fine-grained event-driven manner. Realizing this ideal is difficult in practice, and typically requires sacrificing flexibility for performance. In Anton 2, a massively parallel special-purpose supercomputer for molecular dynamics simulations, we addressed this challenge by including a hardware block, called the dispatch unit, that provides flexible and efficient support for fine-grained event-driven computation. Its novel features include a many-to-many mapping from input data to a set of synchronization counters, and the ability to prioritize tasks based on their type. To solve the additional problem of using a fixed set of synchronization counters to track input data for a potentially large number of tasks, we created a software library that allows programmers to treat Anton 2 as an idealized machine with infinitely many synchronization counters.
The dispatch unit, together with this library, made it possible to simplify our molecular dynamics software by expressing it as a collection of independent tasks, and the resulting fine-grained execution schedule improved overall performance by up to 16\% relative to a coarse-grained schedule for precisely the same computation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Sinha:2013:NRA,
  author =       "Amitabha Sinha and Mitrava Sarkar and Soumojit Acharyya and Suranjan Chakraborty",
  title =        "A novel reconfigurable architecture of a {DSP} processor for efficient mapping of {DSP} functions using field programmable {DSP} arrays",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "1--8",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490304",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Development of modern integrated circuit technologies makes it feasible to develop cheaper, faster and smaller special purpose signal processing function circuits. Digital Signal processing functions are generally implemented either on ASICs with inflexibility, or on FPGAs with bottlenecks of relatively smaller utilization factor or lower speed compared to ASIC. Field Programmable DSP Array (FPDA) is the proposed DSP dedicated device, redolent to FPGA, but with basic fixed common modules (CMs) (like adders, subtractors, multipliers, scaling units, shifters) instead of CLBs. This paper introduces the development of reconfigurable system architecture with a focus on FPDA that integrates different DSP functions like DFT, FFT, DCT, FIR, IIR, and DWT etc. The switching between DSP functions is occurred by reconfiguring the interconnection between CMs.
Validation of the proposed architecture has been achieved on Virtex5 FPGA. The architecture provides sufficient amount of flexibility, parallelism and scalability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Saha:2013:PAF,
  author =       "Amrita Saha and Manideepa Mukherjee and Debanjana Datta and Sangita Saha and Amitabha Sinha",
  title =        "Performance analysis of a {FPGA} based novel binary and {DBNS} multiplier",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "9--16",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490305",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Designing high performance Software Defined Radio (SDR) with low power and flexibility is a major challenge. While the high performance DSP processors are unable to meet the speed requirements of these SDRs, System on chips (SOCs) are also not suitable because of their limited flexibility. Recently dynamically reconfigurable FPGAs have emerged as high performance programmable hardware to execute highly parallel, computationally intensive signal processing functions efficiently. Since basic intention of an SDR is to implement different modulation / demodulation schemes and basic building blocks for such schemes are signal processing functions, FPGAs have become an important component for implementing these. However, the effectiveness of such an approach with respect to cost, performance and flexibility need to be examined. Double Base Number Systems (DBNS) have been gaining attention for compute intensive applications in signal processing because of their higher performance in arithmetic operations in general and particularly multiplication.
Keeping these issues in view, this paper aims to present a new Software defined Radio. To enhance the performance of the proposed architecture, analysis have been done employing both single index and multiple indices DBNS multipliers. Experiments and analysis on performance have also been done with its binary counterpart. Both DBNS and binary based architecture were implemented on Xilinx Virtex IV FPGA using Xilinx ISE 9.1i.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sartin-Tarm:2013:CCS,
  author =       "Michael Sartin-Tarm and Tony Nowatzki and Lorenzo {De Carli} and Karthikeyan Sankaralingam and Cristian Estan",
  title =        "Constraint centric scheduling guide",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "17--21",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490306",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The advent of architectures with software-exposed resources (Spatial Architectures) has created a demand for universally applicable scheduling techniques. This paper describes our generalized spatial scheduling framework, formulated with Integer Linear Programming, and specifically accomplishes two goals. First, using the ``Simple'' architecture, it illustrates how to use our open-source tool to create a customized scheduler and covers problem formulation with ILP and GAMS.
Second, it summarizes results on the application to three real architectures (TRIPS, DySER, PLUG), demonstrating the technique's practicality and competitiveness with existing schedulers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Guha:2013:SEW,
  author =       "Apala Guha and Yao Zhang and Raihan ur Rasool and Andrew A. Chien",
  title =        "Systematic evaluation of workload clustering for extremely energy-efficient architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "22--29",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490307",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Chip power consumption has reached its limits, leading to the flattening of single-core performance. We propose the $10 \times 10$ processor, a federated heterogeneous multi-core architecture, where each core is an ensemble of u-engines (micro-engines, similar to accelerators) specialized for different workload groups to achieve dramatically higher energy efficiency. The u-engines collectively target the entire general-purpose workload space. The problem we study in this article is selecting the set of workloads that each u-engine should be customized for. For this problem we study the computation structure of a wide variety of workloads and cluster together workloads with similar computation structures, the idea being that each u-engine will be customized for the compute structures exhibited by a particular cluster. The constraint on this problem is the silicon budget of a processor.
Lower silicon budgets accommodate fewer u-engines and require individual u-engines to target larger segments of the workload space which leads to lower energy efficiency benefits from customization, because there is more variation among the compute structures making up each cluster. Therefore, we also study how workload coverage and benefit can be maximized for a given silicon budget. We study a broad general-purpose workload that includes 34 codes from 6 benchmark suites, identifying the most frequent functions, and clustering them based on two sets of instruction usage features (high-resolution and low-resolution) into 8, 16, 32, 64, 128 clusters respectively. We develop abstract metrics (coverage and weighted customization benefit) to evaluate the clusters. We show significant potential payoffs with four benefit models: 2-3x (square root model), 4-10x (linear model), 12-24x (quadratic model), and 22-26x (cubic model).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Saha:2013:IDP,
  author =       "Amrita Saha and Pijush Biswas and Amitabha Sinha",
  title =        "An integrated development platform of a reconfigurable radio processor for software defined radio",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "30--35",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490308",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Performance required by ``Software Defined Radio (SDR)'' poses many challenges in real-time applications because of their high computational complexity. Designing a high performance SDR with a high degree of flexibility becomes an issue of importance.
While the fastest programmable DSP processors are unable to meet the speed requirements for SDR, FPGAs also cannot offer the highest possible performance at the lowest silicon cost for a given signal processing function. Moreover, they are not optimized for radio applications because of their LUT based approach. To overcome the limitations of both DSP Processor and FPGAs, Radio Processor, a reconfigurable Processor optimized for Radio applications was conceived [14], [17]. However, advantages of this Radio Processor cannot be made useful unless there is an integrated development environment to develop SDR. This paper addresses these issues by introducing a new Integrated Development platform for reconfigurable ``Radio Processor'' for implementing SDR.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pal:2013:FIN,
  author =       "Santanu Pal and Amitabha Sinha and Pijush Biswas",
  title =        "{FPGA} implementation of a novel {DCT} architecture reducing constant cosine terms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "36--40",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490309",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a new scalable architecture for Discrete Cosine Transform (DCT). In contrast to the conventional DCT architecture, the proposed architecture reduces the number of constant cosine terms using the matrix transposition and symmetry property. This in turn, considerably reduces the computation time. The architecture is scalable and it can be extended to support any transform length.
The architecture was validated on Xilinx Virtex-4 FPGA.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tseng:2013:NNE,
  author =       "Kuo-Kun Tseng and Fu-Fu Zeng and Huang-Nan Huang and Yiming Liu and Jeng-Shyang Pan and W. H. Ip and C. H. Wu",
  title =        "A new non-exact {Aho--Corasick} framework for {ECG} classification",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "41--46",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490310",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The Aho--Corasick (AC) algorithm is a popular and useful exact string matching algorithm for text searching and deep packet inspection. However, it has seldom been used for non-exact classification or identification. We propose a novel framework to make use of AC for non-exact matching in the ECG identification. The AC classification (ACC) algorithm converts ECG waveforms into several short patterns for AC, and decides the identification result by AC matched counting value. In our experiments, the results are surprisingly good and superior to previous algorithms. So, we designed an AC algorithm application for non-exact classification with high accuracy.
Meanwhile, ACC inherits the advantage from AC of being capable of handling a large pattern set with linear time complexity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Maitra:2013:HPM,
  author =       "Subhashis Maitra and Amitabha Sinha",
  title =        "High performance {MAC} unit for {DSP} and cryptographic applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "47--55",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490311",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Multiplication and addition are the basic arithmetic operation used in Digital Signal Processing (DSP) for coefficient multiplication, scalar point multiplication in Elliptic Curve Cryptography (ECC) and in other fields. Multiplications are basically a shift and add operation. However, there are many different variations on how to do it. Some are more suitable to implement on FPGA than others. However time complexities and hardware complexities are the major issues in designing a multiplier unit. There are different multiplication algorithms in current technology. Hardware complexities in some design are more than time complexities whereas in some other design time complexities are more. However there must be a tradeoff between these two types of methodology. This paper will discuss a brief idea how a tradeoff can be achieved. Experimental results that have discussed here and the architecture based on the proposed algorithm shows its novelty.
Applications of the proposed algorithm on DSP and ECC have been dealt here clearly.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2013:INa,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "56--71",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490313",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Belhadj:2013:CRW,
  author =       "Bilel Belhadj and Antoine Joubert and Zheng Li and Rodolphe H{\'e}liot and Olivier Temam",
  title =        "Continuous real-world inputs can open up alternative accelerator designs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "1--12",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485923",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Motivated by energy constraints, future heterogeneous multi-cores may contain a variety of accelerators, each targeting a subset of the application spectrum. Beyond energy, the growing number of faults steers accelerator research towards fault-tolerant accelerators. In this article, we investigate a fault-tolerant and energy-efficient accelerator for signal processing applications. We depart from traditional designs by introducing an accelerator which relies on unary coding, a concept which is well adapted to the continuous real-world inputs of signal processing applications.
Unary coding enables a number of atypical micro-architecture choices which bring down area cost and energy; moreover, unary coding provides graceful output degradation as the amount of transient faults increases. We introduce a configurable hybrid digital/analog micro-architecture capable of implementing a broad set of signal processing applications based on these concepts, together with a back-end optimizer which takes advantage of the special nature of these applications. For a set of five signal applications, we explore the different design tradeoffs and obtain an accelerator with an area cost of 1.63 mm$^2$. On average, this accelerator requires only 2.3\% of the energy of an Atom-like core to implement similar tasks. We then evaluate the accelerator resilience to transient faults, and its ability to trade accuracy for energy savings.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Petrica:2013:FDA,
  author =       "Paula Petrica and Adam M. Izraelevitz and David H. Albonesi and Christine A. Shoemaker",
  title =        "{Flicker}: a dynamically adaptive architecture for power limited multicore systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "13--23",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485924",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Future microprocessors may become so power constrained that not all transistors will be able to be powered on at once. These systems will be required to nimbly adapt to changes in the chip power that is allocated to general-purpose cores and to specialized accelerators. This paper presents Flicker, a general-purpose multicore architecture that dynamically adapts to varying and potentially stringent limits on allocated power.
The Flicker core microarchitecture includes deconfigurable lanes --- horizontal slices through the pipeline --- that permit tailoring an individual core to the running application with lower overhead than microarchitecture-level adaptation, and greater flexibility than core-level power gating. To exploit Flicker's flexible pipeline architecture, a new online multicore optimization algorithm combines reduced sampling techniques, application of response surface models to online optimization, and heuristic online search. The approach efficiently finds a near-global-optimum configuration of lanes without requiring offline training, microarchitecture state, or foreknowledge of the workload. At high power allocations, core-level gating is highly effective, and slightly outperforms Flicker overall. However, under stringent power constraints, Flicker significantly outperforms core-level gating, achieving an average 27\% performance improvement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Qadeer:2013:CEB,
  author =       "Wajahat Qadeer and Rehan Hameed and Ofer Shacham and Preethi Venkatesan and Christos Kozyrakis and Mark A. Horowitz",
  title =        "Convolution engine: balancing efficiency \& flexibility in specialized computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "24--35",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485925",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "This paper focuses on the trade-off between flexibility and efficiency in specialized computing. We observe that specialized units achieve most of their efficiency gains by tuning data storage and compute structures and their connectivity to the data-flow and data-locality patterns in the kernels.
Hence, by identifying key data-flow patterns used in a domain, we can create efficient engines that can be programmed and reused across a wide range of applications. We present an example, the Convolution Engine (CE), specialized for the convolution-like data-flow that is common in computational photography, image processing, and video processing applications. CE achieves energy efficiency by capturing data reuse patterns, eliminating data transfer overheads, and enabling a large number of operations per memory access. We quantify the tradeoffs in efficiency and flexibility and demonstrate that CE is within a factor of 2-3x of the energy and area efficiency of custom units optimized for a single kernel. CE improves energy and area efficiency by 8-15x over a SIMD engine for most applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lim:2013:TSS,
  author =       "Kevin Lim and David Meisner and Ali G. Saidi and Parthasarathy Ranganathan and Thomas F. Wenisch",
  title =        "Thin servers with smart pipes: designing {SoC} accelerators for memcached",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "36--47",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485926",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Distributed in-memory key-value stores, such as memcached, are central to the scalability of modern internet services. Current deployments use commodity servers with high-end processors. However, given the cost-sensitivity of internet services and the recent proliferation of volume low-power System-on-Chip (SoC) designs, we see an opportunity for alternative architectures. We undertake a detailed characterization of memcached to reveal performance and power inefficiencies.
Our study considers both high-performance and low-power CPUs and NICs across a variety of carefully-designed benchmarks that exercise the range of memcached behavior. We discover that, regardless of CPU microarchitecture, memcached execution is remarkably inefficient, saturating neither network links nor available memory bandwidth. Instead, we find performance is typically limited by the per-packet processing overheads in the NIC and OS kernel --- long code paths limit CPU performance due to poor branch predictability and instruction fetch bottlenecks. Our insights suggest that neither high-performance nor low-power cores provide a satisfactory power-performance trade-off, and point to a need for tighter integration of the network interface. Hence, we argue for an alternate architecture --- Thin Servers with Smart Pipes (TSSP) --- for cost-effective high-performance memcached deployment. TSSP couples an embedded-class low-power core to a memcached accelerator that can process GET requests entirely in hardware, offloading both network handling and data look up. We demonstrate the potential benefits of our TSSP architecture through an FPGA prototyping platform, and show the potential for a 6x--16x power-performance improvement over conventional server baselines.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mukundan:2013:UMR,
  author =       "Janani Mukundan and Hillery Hunter and Kyu-hyoun Kim and Jeffrey Stuecheli and Jos{\'e} F.
Mart{\'\i}nez",
  title =        "Understanding and mitigating refresh overheads in high-density {DDR4 DRAM} systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "48--59",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485927",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Recent DRAM specifications exhibit increasing refresh latencies. A refresh command blocks a full rank, decreasing available parallelism in the memory subsystem significantly, thus decreasing performance. Fine Granularity Refresh (FGR) is a feature recently announced as part of JEDEC's DDR4 DRAM specification that attempts to tackle this problem by creating a range of refresh options that provide a trade-off between refresh latency and frequency. In this paper, we first conduct an analysis of DDR4 DRAM's FGR feature, and show that there is no one-size-fits-all option across a variety of applications. We then present Adaptive Refresh (AR), a simple yet effective mechanism that dynamically chooses the best FGR mode for each application and phase within the application. When looking at the refresh problem more closely, we identify in high-density DRAM systems a phenomenon that we call command queue seizure, whereby the memory controller's command queue seizes up temporarily because it is full with commands to a rank that is being refreshed. To attack this problem, we propose two complementary mechanisms called Delayed Command Expansion (DCE) and Preemptive Command Drain (PCD). Our results show that AR does exploit DDR4's FGR effectively. However, once our proposed DCE and PCD mechanisms are added, DDR4's FGR becomes redundant in most cases, except in a few highly memory-sensitive applications, where the use of AR does provide some additional benefit.
In all, our simulations show that the proposed mechanisms yield 8\% (14\%) mean speedup with respect to traditional refresh, at normal (extended) DRAM operating temperatures, for a set of diverse parallel applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:2013:ESD,
  author =       "Jamie Liu and Ben Jaiyen and Yoongu Kim and Chris Wilkerson and Onur Mutlu",
  title =        "An experimental study of data retention behavior in modern {DRAM} devices: implications for retention time profiling mechanisms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "60--71",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485928",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "DRAM cells store data in the form of charge on a capacitor. This charge leaks off over time, eventually causing data to be lost. To prevent this data loss from occurring, DRAM cells must be periodically refreshed. Unfortunately, DRAM refresh operations waste energy and also degrade system performance by interfering with memory requests. These problems are expected to worsen as DRAM density increases. The amount of time that a DRAM cell can safely retain data without being refreshed is called the cell's retention time. In current systems, all DRAM cells are refreshed at the rate required to guarantee the integrity of the cell with the shortest retention time, resulting in unnecessary refreshes for cells with longer retention times. Prior work has proposed to reduce unnecessary refreshes by exploiting differences in retention time among DRAM cells; however, such mechanisms require knowledge of each cell's retention time. In this paper, we present a comprehensive quantitative study of retention behavior in modern DRAMs.
Using a temperature-controlled FPGA-based testing platform, we collect retention time information from 248 commodity DDR3 DRAM chips from five major DRAM vendors. We observe two significant phenomena: data pattern dependence, where the retention time of each DRAM cell is significantly affected by the data stored in other DRAM cells, and variable retention time, where the retention time of some DRAM cells changes unpredictably over time. We discuss possible physical explanations for these phenomena, how their magnitude may be affected by DRAM technology scaling, and their ramifications for DRAM retention time profiling mechanisms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nair:2013:AAF,
  author =       "Prashant J. Nair and Dae-Hyun Kim and Moinuddin K. Qureshi",
  title =        "{ArchShield}: architectural framework for assisting {DRAM} scaling by tolerating high error rates",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "72--83",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485929",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "DRAM scaling has been the prime driver for increasing the capacity of main memory system over the past three decades. Unfortunately, scaling DRAM to smaller technology nodes has become challenging due to the inherent difficulty in designing smaller geometries, coupled with the problems of device variation and leakage. Future DRAM devices are likely to experience significantly high error-rates. Techniques that can tolerate errors efficiently can enable DRAM to scale to smaller technology nodes. However, existing techniques such as row/column sparing and ECC become prohibitive at high error-rates.
To develop cost-effective solutions for tolerating high error-rates, this paper advocates a cross-layer approach. Rather than hiding the faulty cell information within the DRAM chips, we expose it to the architectural level. We propose ArchShield, an architectural framework that employs runtime testing to identify faulty DRAM cells. ArchShield tolerates these faults using two components, a Fault Map that keeps information about faulty words in a cache line, and Selective Word-Level Replication (SWLR) that replicates faulty words for error resilience. Both Fault Map and SWLR are integrated in reserved area in DRAM memory. Our evaluations with 8GB DRAM DIMM show that ArchShield can efficiently tolerate error-rates as higher as 10$^{-4}$ (100x higher than ECC alone), causes less than 2\% performance degradation, and still maintains 1-bit error tolerance against soft errors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ghose:2013:IMS,
  author =       "Saugata Ghose and Hyodong Lee and Jos{\'e} F. Mart{\'\i}nez",
  title =        "Improving memory scheduling via processor-side load criticality information",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "84--95",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485930",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "We hypothesize that performing processor-side analysis of load instructions, and providing this pre-digested information to memory schedulers judiciously, can increase the sophistication of memory decisions while maintaining a lean memory controller that can take scheduling actions quickly. This is increasingly important as DRAM frequencies continue to increase relative to processor speed.
In this paper we propose one such mechanism, pairing up a processor-side load criticality predictor with a lean memory controller that prioritizes load requests based on ranking information supplied from the processor side. Using a sophisticated multi-core simulator that includes a detailed quad-channel DDR3 DRAM model, we demonstrate that this mechanism can improve performance significantly on a CMP, with minimal overhead and virtually no changes to the processor itself. We show that our design compares favorably to several state-of-the-art schedulers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Isci:2013:AEV,
  author =       "Canturk Isci and Suzanne McIntosh and Jeffrey Kephart and Rajarshi Das and James Hanson and Scott Piper and Robert Wolford and Thomas Brey and Robert Kantner and Allen Ng and James Norris and Abdoulaye Traore and Michael Frissora",
  title =        "Agile, efficient virtualization power management with low-latency server power states",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "96--107",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485931",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "One of the main driving forces of the growing adoption of virtualization is its dramatic simplification of the provisioning and dynamic management of IT resources. By decoupling running entities from the underlying physical resources, and by providing easy-to-use controls to allocate, deallocate and migrate virtual machines (VMs) across physical boundaries, virtualization opens up new opportunities for improving overall system resource use and power efficiency.
While a range of techniques for dynamic, distributed resource management of virtualized systems have been proposed and have seen their widespread adoption in enterprise systems, similar techniques for dynamic power management have seen limited acceptance. The main barrier to dynamic, power-aware virtualization management stems not from the limitations of virtualization, but rather from the underlying physical systems; and in particular, the high latency and energy cost of power state change actions suited for virtualization power management. In this work, we first explore the feasibility of low-latency power states for enterprise server systems and demonstrate, with real prototypes, their quantitative energy-performance trade offs compared to traditional server power states. Then, we demonstrate an end-to-end power-aware virtualization management solution leveraging these states, and evaluate the dramatically-favorable power-performance characteristics achievable with such systems. We present, via both real system implementations and scale-out simulations, that virtualization power management with low-latency server power states can achieve comparable overheads as base distributed resource management in virtualized systems, and thus can benefit from the same level of adoption, while delivering close to energy-proportional power efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tu:2013:SDS,
  author =       "Cheng-Chun Tu and Chao-tang Lee and Tzi-cker Chiueh",
  title =        "Secure {I/O} device sharing among virtual machines on multiple hosts",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "108--119",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485932",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13
conference proceedings.",
  abstract =     "Virtualization allows flexible mappings between physical resources and virtual entities, and improves allocation efficiency and agility. Unfortunately, most existing virtualization technologies are limited to resources in a single host. This paper presents the design, implementation and evaluation of a multi-host I/O device virtualization system called Ladon, which enables I/O devices to be shared among virtual machines running on multiple hosts in a secure and efficient way. Specifically, Ladon uses a PCIe network to connect multiple servers with PCIe devices and allows VMs running on these servers to directly interact with these PCIe devices without interfering with one another. Through an evaluation of a fully operational Ladon prototype, we show that there is no throughput and latency penalty of the multi-host I/O virtualization enabled by Ladon compared to those of the existing single-host I/O virtualization technology.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:2013:IVP,
  author =       "Xiaotao Chang and Hubertus Franke and Yi Ge and Tao Liu and Kun Wang and Jimi Xenidis and Fei Chen and Yu Zhang",
  title =        "Improving virtualization in the presence of software managed translation lookaside buffers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "120--129",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485933",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Virtualization has become an important technology that is used across many platforms, particularly servers, to increase utilization, multi-tenancy and security.
Virtualization introduces additional overhead that often relates to memory management, interrupt handling and hypervisor mode switching. Among those, memory management and translation lookaside buffer (TLB) management have been shown to have a significant impact on the performance of systems. Two principal mechanisms for TLB management exist in today's systems, namely software and hardware managed TLBs. In this paper, we analyze and quantify the overhead of a pure software virtualization that is implemented over a software managed TLB. We then describe our design of hardware extensions to support virtualization in systems with software managed TLBs to remove the most dominant overheads. These extensions were implemented in the Power embedded A2 core, which is used in the PowerEN and in the Blue Gene/Q processors. They were used to implement a KVM port. We evaluate each of these hardware extensions to determine their overall contributions to performance and efficiency. Collectively these extensions demonstrate an average improvement of 232\% over a pure software implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:2013:MME,
  author =       "Ji Kim and Christopher Torng and Shreesha Srinath and Derek Lockhart and Christopher Batten",
  title =        "Microarchitectural mechanisms to exploit value structure in {SIMT} architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "130--141",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485934",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "SIMT architectures improve performance and efficiency by exploiting control and memory-access structure across data-parallel threads.
Value structure occurs when multiple threads operate on values that can be compactly encoded, e.g., by using a simple function of the thread index. We characterize the availability of control, memory-access, and value structure in typical kernels and observe ample amounts of value structure that is largely ignored by current SIMT architectures. We propose three microarchitectural mechanisms to exploit value structure based on compact affine execution of arithmetic, branch, and memory instructions. We explore these mechanisms within the context of traditional SIMT microarchitectures (GP-SIMT), found in general-purpose graphics processing units, as well as fine-grain SIMT microarchitectures (FG-SIMT), a SIMT variant appropriate for compute-focused data-parallel accelerators. Cycle-level modeling of a modern GP-SIMT system and a VLSI implementation of an eight-lane FG-SIMT execution engine are used to evaluate a range of application kernels. When compared to a baseline without compact affine execution, our approach can improve GP-SIMT cycle-level performance by 4-17\% and can improve FG-SIMT absolute performance by 20-65\% and energy efficiency up to 30\% for a majority of the kernels.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parashar:2013:TIC,
  author =       "Angshuman Parashar and Michael Pellauer and Michael Adler and Bushra Ahsan and Neal Crago and Daniel Lustig and Vladimir Pavlov and Antonia Zhai and Mohit Gambhir and Aamer Jaleel and Randy Allmon and Rachid Rayess and Stephen Maresh and Joel Emer",
  title =        "Triggered instructions: a control paradigm for spatially-programmed architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "142--153",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485935",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13
conference proceedings.",
  abstract =     "In this paper, we present triggered instructions, a novel control paradigm for arrays of processing elements (PEs) aimed at exploiting spatial parallelism. Triggered instructions completely eliminate the program counter and allow programs to transition concisely between states without explicit branch instructions. They also allow efficient reactivity to inter-PE communication traffic. The approach provides a unified mechanism to avoid over-serialized execution, essentially achieving the effect of techniques such as dynamic instruction reordering and multithreading, which each require distinct hardware mechanisms in a traditional sequential architecture. Our analysis shows that a triggered-instruction based spatial accelerator can achieve 8X greater area-normalized performance than a traditional general-purpose processor. Further analysis shows that triggered control reduces the number of static and dynamic instructions in the critical paths by 62\% and 64\% respectively over a program-counter style spatial baseline, resulting in a speedup of 2.0X.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Joao:2013:UBA,
  author =       "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu and Yale N. Patt",
  title =        "Utility-based acceleration of multithreaded applications on asymmetric {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "154--165",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485936",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Asymmetric Chip Multiprocessors (ACMPs) are becoming a reality.
ACMPs can speed up parallel applications if they can identify and accelerate code segments that are critical for performance. Proposals already exist for using coarse-grained thread scheduling and fine-grained bottleneck acceleration. Unfortunately, there have been no proposals offered thus far to decide which code segments to accelerate in cases where both coarse-grained thread scheduling and fine-grained bottleneck acceleration could have value. This paper proposes Utility-Based Acceleration of Multithreaded Applications on Asymmetric CMPs (UBA), a cooperative software/hardware mechanism for identifying and accelerating the most likely critical code segments from a set of multithreaded applications running on an ACMP. The key idea is a new Utility of Acceleration metric that quantifies the performance benefit of accelerating a bottleneck or a thread by taking into account both the criticality and the expected speedup. UBA outperforms the best of two state-of-the-art mechanisms by 11\% for single application workloads and by 7\% for two-application workloads on an ACMP with 52 small cores and 3 large cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kudrow:2013:QRC,
  author =       "Daniel Kudrow and Kenneth Bier and Zhaoxia Deng and Diana Franklin and Yu Tomita and Kenneth R. Brown and Frederic T. Chong",
  title =        "Quantum rotations: a case study in static and dynamic machine-code generation for quantum computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "166--176",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485937",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Work in quantum computer architecture has focused on communication, layout and fault tolerance, largely driven by Shor's factorization algorithm.
For the first time, we study a larger range of benchmarks and find that another critical issue is the generation of code sequences for quantum rotation operations. Specifically, quantum algorithms require arbitrary rotation angles, while quantum technologies and error correction codes provide only for discrete angles and operators. A sequence of quantum machine instructions must be generated to approximate the arbitrary rotation to the required precision. While previous work has focused exclusively on static compilation, we find that some applications require dynamic code generation and explore the advantages and disadvantages of static and dynamic approaches. We find that static code generation can, in some cases, lead to a terabyte of machine code to support required rotations. We also find that some rotation angles are unknown until run time, requiring dynamic code generation. Dynamic code generation, however, exhibits significant trade-offs in terms of time overhead versus code size. Furthermore, dynamic code generation will be performed on classical (non-quantum) computing resources, which may or may not have a clock speed advantage over the target quantum technology. For example, operations on trapped ions run at kilohertz speeds, but superconducting qubits run at gigahertz speeds. We introduce a new method for compiling arbitrary rotations dynamically, designed to minimize compilation time. The new method reduces compilation time by up to five orders of magnitude while increasing code size by one order of magnitude. We explore the design space formed by these trade-offs of dynamic versus static code generation, code quality, and quantum technology.
We introduce several techniques to provide smoother trade-offs for dynamic code generation and evaluate the viability of options in the design space.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Muscat:2013:DBM,
  author =       "Richard A. Muscat and Karin Strauss and Luis Ceze and Georg Seelig",
  title =        "{DNA}-based molecular architecture with spatially localized components",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "177--188",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485938",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Performing computation inside living cells offers life-changing applications, from improved medical diagnostics to better cancer therapy to intelligent drugs. Due to its bio-compatibility and ease of engineering, one promising approach for performing in-vivo computation is DNA strand displacement. This paper introduces computer architects to DNA strand displacement ``circuits'', discusses associated architectural challenges, and proposes a new organization that provides practical composability. In particular, prior approaches rely mostly on stochastic interaction of freely diffusing components. This paper proposes practical spatial isolation of components, leading to more easily designed DNA-based circuits. DNA nanotechnology is currently at a turning point, with many proposed applications being realized [20, 9]. We believe that it is time for the computer architecture community to take notice and contribute.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Guo:2013:ADA,
  author =       "Qing Guo and Xiaochen Guo and Ravi Patel and Engin Ipek and Eby G.
Friedman",
  title =        "{AC-DIMM}: associative computing with {STT-MRAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "189--200",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485939",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "With technology scaling, on-chip power dissipation and off-chip memory bandwidth have become significant performance bottlenecks in virtually all computer systems, from mobile devices to supercomputers. An effective way of improving performance in the face of bandwidth and power limitations is to rely on associative memory systems. Recent work on a PCM-based, associative TCAM accelerator shows that associative search capability can reduce both off-chip bandwidth demand and overall system energy. Unfortunately, previously proposed resistive TCAM accelerators have limited flexibility: only a restricted (albeit important) class of applications can benefit from a TCAM accelerator, and the implementation is confined to resistive memory technologies with a high dynamic range ( {R$_{High}$} /{R$_{Low}$} ), such as PCM. This work proposes AC-DIMM, a flexible, high-performance associative compute engine built on a DDR3-compatible memory module. AC-DIMM addresses the limited flexibility of previous resistive TCAM accelerators by combining two powerful capabilities --- associative search and processing in memory. Generality is improved by augmenting a TCAM system with a set of integrated, user programmable microcontrollers that operate directly on search results, and by architecting the system such that key-value pairs can be co-located in the same TCAM row. A new, bit-serial TCAM array is proposed, which enables the system to be implemented using STT-MRAM.
AC-DIMM achieves a 4.2X speedup and a 6.5X energy reduction over a conventional RAM-based system on a set of 13 evaluated applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hechtman:2013:EMC,
  author =       "Blake A. Hechtman and Daniel J. Sorin",
  title =        "Exploring memory consistency for massively-threaded throughput-oriented processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "201--212",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485940",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "We re-visit the issue of hardware consistency models in the new context of massively-threaded throughput-oriented processors (MTTOPs). A prominent example of an MTTOP is a GPGPU, but other examples include Intel's MIC architecture and some recent academic designs. MTTOPs differ from CPUs in many significant ways, including their ability to tolerate latency, their memory system organization, and the characteristics of the software they run. We compare implementations of various hardware consistency models for MTTOPs in terms of performance, energy-efficiency, hardware complexity, and programmability. Our results show that the choice of hardware consistency model has a surprisingly minimal impact on performance and thus the decision should be based on hardware complexity, energy-efficiency, and programmability.
For many MTTOPs, it is likely that even a simple implementation of sequential consistency is attractive.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Duan:2013:WTM,
  author =       "Yuelu Duan and Abdullah Muzahid and Josep Torrellas",
  title =        "{WeeFence}: toward making fences free in {TSO}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "213--224",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485941",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Although fences are designed for low-overhead concurrency coordination, they can be expensive in current machines. If fences were largely free, faster fine-grained concurrent algorithms could be devised, and compilers could guarantee Sequential Consistency (SC) at little cost. In this paper, we present WeeFence (or WFence for short), a fence that is very cheap because it allows post-fence accesses to skip it. Such accesses can typically complete and retire before the pre-fence writes have drained from the write buffer. Only when an incorrect reordering of accesses is about to happen, does the hardware stall to prevent it. In the paper, we present the WFence design for TSO, and compare it to a conventional fence with speculation for 8-processor multicore simulations. We run parallel kernels that contain explicit fences and parallel applications that do not. For the kernels, WFence eliminates nearly all of the fence stall, reducing the kernels' execution time by an average of 11\%. For the applications, a conservative compiler algorithm places fences in the code to guarantee SC.
In this case, on average, WFences reduce the resulting fence overhead from 38\% of the applications' execution time to 2\% (in a centralized WFence design), or from 36\% to 5\% (in a distributed WFence design).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cain:2013:RAS,
  author =       "Harold W. Cain and Maged M. Michael and Brad Frey and Cathy May and Derek Williams and Hung Le",
  title =        "Robust architectural support for transactional memory in the {Power} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "225--236",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485942",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "On the twentieth anniversary of the original publication [10], following ten years of intense activity in the research literature, hardware support for transactional memory (TM) has finally become a commercial reality, with HTM-enabled chips currently or soon-to-be available from many hardware vendors. In this paper we describe architectural support for TM added to a future version of the Power ISA{\TM}. Two imperatives drove the development: the desire to complement our weakly-consistent memory model with a more friendly interface to simplify the development and porting of multithreaded applications, and the need for robustness beyond that of some early implementations. In the process of commercializing the feature, we had to resolve some previously unexplored interactions between TM and existing features of the ISA, for example translation shootdown, interrupt handling, atomic read-modify-write primitives, and our weakly consistent memory model.
We describe these interactions, the overall architecture, and discuss the motivation and rationale for our choices of architectural semantics, beyond what is typically found in reference manuals.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Basu:2013:EVM,
  author =       "Arkaprava Basu and Jayneel Gandhi and Jichuan Chang and Mark D. Hill and Michael M. Swift",
  title =        "Efficient virtual memory for big memory servers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "237--248",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485943",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Our analysis shows that many ``big-memory'' server workloads, such as databases, in-memory caches, and graph analytics, pay a high cost for page-based virtual memory. They consume as much as 10\% of execution cycles on TLB misses, even using large pages. On the other hand, we find that these workloads use read-write permission on most pages, are provisioned not to swap, and rarely benefit from the full flexibility of page-based virtual memory. To remove the TLB miss overhead for big-memory workloads, we propose mapping part of a process's linear virtual address space with a direct segment, while page mapping the rest of the virtual address space. Direct segments use minimal hardware --- base, limit and offset registers per core --- to map contiguous virtual memory regions directly to contiguous physical memory. They eliminate the possibility of TLB misses for key data structures such as database buffer pools and in-memory key-value stores. Memory mapped by a direct segment may be converted back to paging when needed. We prototype direct-segment software support for x86-64 in Linux and emulate direct-segment hardware.
For our workloads, direct segments eliminate almost all TLB misses and reduce the execution time wasted on TLB misses to less than 0.5\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wu:2013:NBD,
  author =       "Lisa Wu and Raymond J. Barker and Martha A. Kim and Kenneth A. Ross",
  title =        "Navigating big data with high-throughput, energy-efficient data partitioning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "249--260",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485944",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "The global pool of data is growing at 2.5 quintillion bytes per day, with 90\% of it produced in the last two years alone [24]. There is no doubt the era of big data has arrived. This paper explores targeted deployment of hardware accelerators to improve the throughput and energy efficiency of large-scale data processing. In particular, data partitioning is a critical operation for manipulating large data sets. It is often the limiting factor in database performance and represents a significant fraction of the overall runtime of large data queries. To accelerate partitioning, this paper describes a hardware accelerator for range partitioning, or HARP, and a hardware-software data streaming framework. The streaming framework offers a seamless execution environment for streaming accelerators such as HARP. Together, HARP and the streaming framework provide an order of magnitude improvement in partitioning performance and energy.
A detailed analysis of a 32 nm physical design shows 7.8 times the throughput of a highly optimized and optimistic software implementation, while consuming just 6.9\% of the area and 4.3\% of the power of a single Xeon core in the same technology generation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chung:2013:LBD,
  author =       "Eric S. Chung and John D. Davis and Jaewon Lee",
  title =        "{LINQits}: big data on little clients",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "261--272",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485945",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "We present LINQits, a flexible hardware template that can be mapped onto programmable logic or ASICs in a heterogeneous system-on-chip for a mobile device or server. Unlike fixed-function accelerators, LINQits accelerates a domain-specific query language called LINQ. LINQits does not provide coverage for all possible applications --- however, existing applications (re-)written with LINQ in mind benefit extensively from hardware acceleration. Furthermore, the LINQits framework offers a graceful and transparent migration path from software to hardware. LINQits is prototyped on a 2W heterogeneous SoC called the ZYNQ processor, which combines dual ARM A9 processors with an FPGA on a single die in 28nm silicon technology.
Our physical measurements show that LINQits improves energy
                 efficiency by 8.9 to 30.6 times and performance by 10.7
                 to 38.1 times compared to optimized, multithreaded C
                 programs running on conventional ARM A9 processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Atta:2013:SBI,
  author =       "Islam Atta and Pinar T{\"o}z{\"u}n and Xin Tong and
                 Anastasia Ailamaki and Andreas Moshovos",
  title =        "{STREX}: boosting instruction cache reuse in {OLTP}
                 workloads through stratified transaction execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "273--284",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485946",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Online transaction processing (OLTP) workload
                 performance suffers from instruction stalls; the
                 instruction footprint of a typical transaction exceeds
                 by far the capacity of an L1 cache, leading to ongoing
                 cache thrashing. Several proposed techniques remove
                 some instruction stalls in exchange for error-prone
                 instrumentation to the code base, or a sharp increase
                 in the L1-I cache unit area and power. Others reduce
                 instruction miss latency by better utilizing a shared
                 L2 cache. SLICC [2], a recently proposed thread
                 migration technique that exploits transaction
                 instruction locality, is promising for high core counts
                 but performs sub-optimally or may hurt performance when
                 running on few cores. This paper corroborates that OLTP
                 transactions exhibit significant intra- and
                 inter-thread overlap in their instruction footprint,
                 and analyzes the instruction stall reduction benefits.
                 This paper presents STREX, a hardware,
                 programmer-transparent technique that exploits typical
                 transaction behavior to improve instruction reuse in
                 first level caches.
STREX time-multiplexes the execution of similar transactions
                 dynamically on a single core so that instructions
                 fetched by one transaction are reused by all other
                 transactions executing in the system as much as
                 possible. STREX dynamically slices the execution of
                 each transaction into cache-sized segments simply by
                 observing when blocks are brought in the cache and when
                 they are evicted. Experiments show that, when compared
                 to baseline execution on 2--16 cores, STREX
                 consistently improves performance while reducing the
                 number of L1 instruction and data misses by 37\% and
                 14\% on average, respectively. Finally, this paper
                 proposes a practical hybrid technique that combines
                 STREX and SLICC, thereby guaranteeing performance
                 benefits regardless of the number of available cores
                 and the workload's footprint.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Paul:2013:CBN,
  author =       "Indrani Paul and Srilatha Manne and Manish Arora and
                 W. Lloyd Bircher and Sudhakar Yalamanchili",
  title =        "Cooperative boosting: needy versus greedy power
                 management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "285--296",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485947",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "This paper examines the interaction between thermal
                 management techniques and power boosting in a
                 state-of-the-art heterogeneous processor consisting of
                 a set of CPU and GPU cores. We show that for classes of
                 applications that utilize both the CPU and the GPU,
                 modern boost algorithms that greedily seek to convert
                 thermal headroom into performance can interact with
                 thermal coupling effects between the CPU and the GPU to
                 degrade performance.
We first examine the causes of this behavior and explain the
                 interaction between thermal coupling, performance
                 coupling, and workload behavior. Then we propose a
                 dynamic power-management approach called cooperative
                 boosting (CB) to allocate power dynamically between CPU
                 and GPU in a manner that balances thermal coupling
                 against the needs of performance coupling to optimize
                 performance under a given thermal constraint. Through
                 real hardware-based measurements, we evaluate CB
                 against a state-of-the-practice boost algorithm and
                 show that overall application performance and power
                 savings increase by 10\% and 8\% (up to 52\% and 34\%),
                 respectively, resulting in average energy efficiency
                 improvement of 25\% (up to 76\%) over a wide range of
                 benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bacha:2013:DRV,
  author =       "Anys Bacha and Radu Teodorescu",
  title =        "Dynamic reduction of voltage margins by leveraging
                 on-chip {ECC} in {Itanium II} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "297--307",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485948",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Lowering supply voltage is one of the most effective
                 approaches for improving the energy efficiency of
                 microprocessors. Unfortunately, technology limitations,
                 such as process variability and circuit aging, are
                 forcing microprocessor designers to add larger voltage
                 guardbands to their chips. This makes supply voltage
                 increasingly difficult to scale with technology. This
                 paper presents a new mechanism for dynamically reducing
                 voltage margins while maintaining the chip operating
                 frequency constant.
Unlike previous approaches that rely on special hardware to
                 detect and recover from timing violations caused by
                 low-voltage execution, our solution is firmware-based
                 and does not require additional hardware. Instead, it
                 relies on error correction mechanisms already built
                 into modern processors. The system dynamically reduces
                 voltage margins and uses correctable error reports
                 raised by the hardware to identify the lowest, safe
                 operating voltage. The solution adapts to core-to-core
                 variability by tailoring supply voltage to each core's
                 safe operating level. In addition, it exploits
                 variability in workload vulnerability to low voltage
                 execution. The system was prototyped on an HP Integrity
                 Server that uses Intel's Itanium 9560 processors.
                 Evaluation using SPECjbb2005 and SPEC CPU2000 workloads
                 shows core power savings ranging from 18\% to 23\%,
                 with minimal performance impact.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cook:2013:HEC,
  author =       "Henry Cook and Miquel Moreto and Sarah Bird and Khanh
                 Dao and David A. Patterson and Krste Asanovic",
  title =        "A hardware evaluation of cache partitioning to improve
                 utilization and energy-efficiency while preserving
                 responsiveness",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "308--319",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485949",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Computing workloads often contain a mix of
                 interactive, latency-sensitive foreground applications
                 and recurring background computations. To guarantee
                 responsiveness, interactive and batch applications are
                 often run on disjoint sets of resources, but this
                 incurs additional energy, power, and capital costs.
                 In this paper, we evaluate the potential of hardware
                 cache partitioning mechanisms and policies to improve
                 efficiency by allowing background applications to run
                 simultaneously with interactive foreground
                 applications, while avoiding degradation in interactive
                 responsiveness. We evaluate these tradeoffs using
                 commercial x86 multicore hardware that supports cache
                 partitioning, and find that real hardware measurements
                 with full applications provide different observations
                 than past simulation-based evaluations. Co-scheduling
                 applications without LLC partitioning leads to a 10\%
                 energy improvement and average throughput improvement
                 of 54\% compared to running tasks separately, but can
                 result in foreground performance degradation of up to
                 34\% with an average of 6\%. With optimal static LLC
                 partitioning, the average energy improvement increases
                 to 12\% and the average throughput improvement to 60\%,
                 while the worst case slowdown is reduced noticeably to
                 7\% with an average slowdown of only 2\%. We also
                 evaluate a practical low-overhead dynamic algorithm to
                 control partition sizes, and are able to realize the
                 potential performance guarantees of the optimal static
                 approach, while increasing background throughput by an
                 additional 19\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Das:2013:CEP,
  author =       "Reetuparna Das and Satish Narayanasamy and Sudhir K.
                 Satpathy and Ronald G.
Dreslinski",
  title =        "{Catnap}: energy proportional multiple
                 network-on-chip",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "320--331",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485950",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Multiple networks have been used in several processor
                 implementations to scale bandwidth and ensure
                 protocol-level deadlock freedom for different message
                 classes. In this paper, we observe that a
                 multiple-network design is also attractive from a power
                 perspective and can be leveraged to achieve energy
                 proportionality by effective power gating. Unlike a
                 single-network design, a multiple-network design is
                 more amenable to power gating, as its subnetworks
                 (subnets) can be power gated without compromising the
                 connectivity of the network. To exploit this
                 opportunity, we propose the Catnap architecture which
                 consists of synergistic subnet selection and
                 power-gating policies. Catnap maximizes the number of
                 consecutive idle cycles in a router, while avoiding
                 performance loss due to overloading a subnet. We
                 evaluate a 256-core processor with a concentrated mesh
                 topology using synthetic traffic and 35 applications.
                 We show that the average network power of a
                 power-gating optimized multiple-network design with
                 four subnets could be 44\% lower than a bandwidth
                 equivalent single-network design for an average
                 performance cost of about 5\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jog:2013:OSP,
  author =       "Adwait Jog and Onur Kayiran and Asit K. Mishra and
                 Mahmut T. Kandemir and Onur Mutlu and Ravishankar Iyer
                 and Chita R.
Das",
  title =        "Orchestrated scheduling and prefetching for
                 {GPGPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "332--343",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485951",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "In this paper, we present techniques that coordinate
                 the thread scheduling and prefetching decisions in a
                 General Purpose Graphics Processing Unit (GPGPU)
                 architecture to better tolerate long memory latencies.
                 We demonstrate that existing warp scheduling policies
                 in GPGPU architectures are unable to effectively
                 incorporate data prefetching. The main reason is that
                 they schedule consecutive warps, which are likely to
                 access nearby cache blocks and thus prefetch accurately
                 for one another, back-to-back in consecutive cycles.
                 This either (1) causes prefetches to be generated by a
                 warp too close to the time their corresponding
                 addresses are actually demanded by another warp, or (2)
                 requires sophisticated prefetcher designs to correctly
                 predict the addresses required by a future
                 ``far-ahead'' warp while executing the current warp. We
                 propose a new prefetch-aware warp scheduling policy
                 that overcomes these problems. The key idea is to
                 separate in time the scheduling of consecutive warps
                 such that they are not executed back-to-back. We show
                 that this policy not only enables a simple prefetcher
                 to be effective in tolerating memory latencies but also
                 improves memory bank parallelism, even when prefetching
                 is not employed. Experimental evaluations across a
                 diverse set of applications on a 30-core simulated
                 GPGPU platform demonstrate that the prefetch-aware warp
                 scheduler provides 25\% and 7\% average performance
                 improvement over baselines that employ prefetching in
                 conjunction with, respectively, the commonly-employed
                 round-robin scheduler or the recently-proposed
                 two-level warp scheduler.
Moreover, when prefetching is not employed, the prefetch-aware
                 warp scheduler provides higher performance than both of
                 these baseline schedulers as it better exploits memory
                 bank parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jing:2013:EES,
  author =       "Naifeng Jing and Yao Shen and Yao Lu and Shrikanth
                 Ganapathy and Zhigang Mao and Minyi Guo and Ramon Canal
                 and Xiaoyao Liang",
  title =        "An energy-efficient and scalable {eDRAM}-based
                 register file architecture for {GPGPU}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "344--355",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485952",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "The heavily-threaded data processing demands of
                 streaming multiprocessors (SM) in a GPGPU require a
                 large register file (RF). The fast increasing size of
                 the RF makes the area cost and power consumption
                 unaffordable for traditional SRAM designs in the future
                 technologies. In this paper, we propose to use
                 embedded-DRAM (eDRAM) as an alternative in future
                 GPGPUs. Compared with SRAM, eDRAM provides higher
                 density and lower leakage power. However, the limited
                 data retention time in eDRAM poses new challenges.
                 Periodic refresh operations are needed to maintain data
                 integrity. This is exacerbated with the scaling of
                 eDRAM density, process variations and temperature.
                 Unlike conventional CPUs which make use of multi-ported
                 RF, most of the RFs in modern GPGPU are heavily banked
                 but not multi-ported to reduce the hardware cost. This
                 provides a unique opportunity to hide the refresh
                 overhead. We propose two different eDRAM
                 implementations based on 3T1D and 1T1C memory cells.
To mitigate the impact of periodic refresh, we propose two novel
                 refresh solutions using bank bubble and bank
                 walk-through. Plus, for the 1T1C RF, we design an
                 interleaved bank organization together with an
                 intelligent warp scheduling strategy to reduce the
                 impact of the destructive reads. The analysis shows
                 that our schemes present better energy efficiency,
                 scalability and variation tolerance than traditional
                 SRAM-based designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rhu:2013:MSR,
  author =       "Minsoo Rhu and Mattan Erez",
  title =        "Maximizing {SIMD} resource utilization in {GPGPUs}
                 with {SIMD} lane permutation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "356--367",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485953",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Current GPUs maintain high programmability by
                 abstracting the SIMD nature of the hardware as
                 independent concurrent threads of control with hardware
                 responsible for generating predicate masks to utilize
                 the SIMD hardware for different flows of control. This
                 dynamic masking leads to poor utilization of SIMD
                 resources when the control of different threads in the
                 same SIMD group diverges. Prior research suggests that
                 SIMD groups be formed dynamically by compacting a large
                 number of threads into groups, mitigating the impact of
                 divergence. To maintain hardware efficiency, however,
                 the alignment of a thread to a SIMD lane is fixed,
                 limiting the potential for compaction. We observe that
                 control frequently diverges in a manner that prevents
                 compaction because of the way in which the fixed
                 alignment of threads to lanes is done. This paper
                 presents an in-depth analysis on the causes for
                 ineffective compaction.
An important observation is that in many cases, control diverges
                 because of programmatic branches, which do not depend
                 on input data. This behavior, when combined with the
                 default mapping of threads to lanes, severely restricts
                 compaction. We then propose SIMD lane permutation (SLP)
                 as an optimization to expand the applicability of
                 compaction in such cases of lane alignment. SLP seeks
                 to rearrange how threads are mapped to lanes to allow
                 even programmatic branches to be compacted effectively,
                 improving SIMD utilization up to 34\% accompanied by a
                 maximum 25\% performance boost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vaidya:2013:SDO,
  author =       "Aniruddha S. Vaidya and Anahita Shayesteh and Dong
                 Hyuk Woo and Roy Saharoy and Mani Azimi",
  title =        "{SIMD} divergence optimization through intra-warp
                 compaction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "368--379",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485954",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "SIMD execution units in GPUs are increasingly used for
                 high performance and energy efficient acceleration of
                 general purpose applications. However, SIMD control
                 flow divergence effects can result in reduced execution
                 efficiency in a class of GPGPU applications, classified
                 as divergent applications. Improving SIMD efficiency,
                 therefore, has the potential to bring significant
                 performance and energy benefits to a wide range of such
                 data parallel applications. Recently, the SIMD
                 divergence problem has received increased attention,
                 and several micro-architectural techniques have been
                 proposed to address various aspects of this problem.
However, these techniques are often quite complex and, therefore,
                 unlikely candidates for practical implementation. In
                 this paper, we propose two micro-architectural
                 optimizations for GPGPU architectures, which utilize
                 relatively simple execution cycle compression
                 techniques when certain groups of turned-off lanes
                 exist in the instruction stream. We refer to these
                 optimizations as basic cycle compression (BCC) and
                 swizzled-cycle compression (SCC), respectively. In this
                 paper, we will outline the additional requirements for
                 implementing these optimizations in the context of the
                 studied GPGPU architecture. Our evaluations with
                 divergent SIMD workloads from OpenCL (GPGPU) and OpenGL
                 (graphics) applications show that BCC and SCC reduce
                 execution cycles in divergent applications by as much
                 as 42\% (20\% on average). For a subset of divergent
                 workloads, the execution time is reduced by an average
                 of 7\% for today's GPUs or by 18\% for future GPUs with
                 a better provisioned memory subsystem. The key
                 contribution of our work is in simplifying the
                 micro-architecture for delivering divergence
                 optimizations while providing the bulk of the benefits
                 of more complex approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Son:2013:RMA,
  author =       "Young Hoon Son and Seongil O and Yuhwan Ro and Jae W.
                 Lee and Jung Ho Ahn",
  title =        "Reducing memory access latency with asymmetric {DRAM}
                 bank organizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "380--391",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485955",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "DRAM has been a de facto standard for main memory, and
                 advances in process technology have led to a rapid
                 increase in its capacity and bandwidth.
                 In contrast, its random access latency has remained
                 relatively stagnant, as it is still around 100 CPU
                 clock cycles. Modern computer systems rely on caches or
                 other latency tolerance techniques to lower the average
                 access latency. However, not all applications have
                 ample parallelism or locality that would help hide or
                 reduce the latency. Moreover, applications' demands for
                 memory space continue to grow, while the capacity gap
                 between last-level caches and main memory is unlikely
                 to shrink. Consequently, reducing the main-memory
                 latency is important for application performance.
                 Unfortunately, previous proposals have not adequately
                 addressed this problem, as they have focused only on
                 improving the bandwidth and capacity or reduced the
                 latency at the cost of significant area overhead. We
                 propose asymmetric DRAM bank organizations to reduce
                 the average main-memory access latency. We first
                 analyze the access and cycle times of a modern DRAM
                 device to identify key delay components for latency
                 reduction. Then we reorganize a subset of DRAM banks to
                 reduce their access and cycle times by half with low
                 area overhead. By synergistically combining these
                 reorganized DRAM banks with support for non-uniform
                 bank accesses, we introduce a novel DRAM bank
                 organization with center high-aspect-ratio mats called
                 CHARM.
Experiments on a simulated chip-multiprocessor system show that
                 CHARM improves both the instructions per cycle and
                 system-wide energy-delay product up to 21\% and 32\%,
                 respectively, with only a 3\% increase in die area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:2013:CTP,
  author =       "Ziyi Liu and JongHyuk Lee and Junyuan Zeng and
                 Yuanfeng Wen and Zhiqiang Lin and Weidong Shi",
  title =        "{CPU} transparent protection of {OS} kernel and
                 hypervisor integrity with programmable {DRAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "392--403",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485956",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Increasingly, cyber attacks (e.g., kernel rootkits)
                 target the inner rings of a computer system, and they
                 have seriously undermined the integrity of the entire
                 computer systems. To eliminate these threats, it is
                 imperative to develop innovative solutions running
                 below the attack surface. This paper presents MGuard, a
                 new most inner ring solution for inspecting the system
                 integrity that is directly integrated with the DRAM
                 DIMM devices. More specifically, we design a
                 programmable guard that is integrated with the advanced
                 memory buffer of FB-DIMM to continuously monitor all
                 the memory traffic and detect the system integrity
                 violations. Unlike the existing approaches that are
                 either snapshot-based or lack compatibility and
                 flexibility, MGuard continuously monitors the integrity
                 of all the outer rings including both OS kernel and
                 hypervisor of interest, with a greater extendibility
                 enabled by a programmable interface. It offers a
                 hardware drop-in solution transparent to the host CPU
                 and memory controller.
Moreover, MGuard is isolated from the host software and hardware,
                 leading to strong security for remote attackers. Our
                 simulation-based experimental results show that MGuard
                 introduces no speed overhead, and is able to detect
                 nearly all the OS-kernel and hypervisor control data
                 related rootkits we tested.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jevdjic:2013:SDC,
  author =       "Djordje Jevdjic and Stavros Volos and Babak Falsafi",
  title =        "Die-stacked {DRAM} caches for servers: hit ratio,
                 latency, or bandwidth? {Have} it all with footprint
                 cache",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "404--415",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485957",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Recent research advocates using large die-stacked DRAM
                 caches to break the memory bandwidth wall. Existing
                 DRAM cache designs fall into one of two categories ---
                 block-based and page-based. The former organize data in
                 conventional blocks (e.g., 64B), ensuring low off-chip
                 bandwidth utilization, but co-locate tags and data in
                 the stacked DRAM, incurring high lookup latency.
                 Furthermore, such designs suffer from low hit ratios
                 due to poor temporal locality. In contrast, page-based
                 caches, which manage data at larger granularity (e.g.,
                 4KB pages), allow for reduced tag array overhead and
                 fast lookup, and leverage high spatial locality at the
                 cost of moving large amounts of data on and off the
                 chip. This paper introduces Footprint Cache, an
                 efficient die-stacked DRAM cache design for server
                 processors. Footprint Cache allocates data at the
                 granularity of pages, but identifies and fetches only
                 those blocks within a page that will be touched during
                 the page's residency in the cache --- i.e., the page's
                 footprint.
In doing so, Footprint Cache eliminates the excessive off-chip
                 traffic associated with page-based designs, while
                 preserving their high hit ratio, small tag array
                 overhead, and low lookup latency. Cycle-accurate
                 simulation results of a 16-core server with up to 512MB
                 Footprint Cache indicate a 57\% performance improvement
                 over a baseline chip without a die-stacked cache.
                 Compared to a state-of-the-art block-based design, our
                 design improves performance by 13\% while reducing
                 dynamic energy of stacked DRAM by 24\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sim:2013:RSD,
  author =       "Jaewoong Sim and Gabriel H. Loh and Vilas Sridharan
                 and Mike O'Connor",
  title =        "Resilient die-stacked {DRAM} caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "416--427",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485958",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Die-stacked DRAM can provide large amounts of
                 in-package, high-bandwidth cache storage. For server
                 and high-performance computing markets, however, such
                 DRAM caches must also provide sufficient support for
                 reliability and fault tolerance. While conventional
                 off-chip memory provides ECC support by adding one or
                 more extra chips, this may not be practical in a 3D
                 stack. In this paper, we present a DRAM cache
                 organization that uses error-correcting codes (ECCs),
                 strong checksums (CRCs), and dirty data duplication to
                 detect and correct a wide range of stacked DRAM
                 failures, from traditional bit errors to large-scale
                 row, column, bank, and channel failures.
With only a modest performance degradation compared to a DRAM
                 cache with no ECC support, our proposal can correct all
                 single-bit failures, and 99.9993\% of all row, column,
                 and bank failures, providing more than a 54,000x
                 improvement in the FIT rate of silent-data corruptions
                 compared to basic SECDED ECC protection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Du:2013:BMB,
  author =       "Yu Du and Miao Zhou and Bruce R. Childers and Daniel
                 Moss{\'e} and Rami Melhem",
  title =        "Bit mapping for balanced {PCM} cell programming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "428--439",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485959",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Write bandwidth is an inherent performance bottleneck
                 for Phase Change Memory (PCM) for two reasons. First,
                 PCM cells have long programming time, and second, only
                 a limited number of PCM cells can be programmed
                 concurrently due to programming current and write
                 circuit constraints. For each PCM write, the data bits
                 of the write request are typically mapped to multiple
                 cell groups and processed in parallel. We observed that
                 an unbalanced distribution of modified data bits among
                 cell groups significantly increases PCM write time and
                 hurts effective write bandwidth. To address this issue,
                 we first uncover the cyclical and cluster patterns for
                 modified data bits. Next, we propose double XOR mapping
                 (D-XOR) to distribute modified data bits among cell
                 groups in a balanced way. D-XOR can reduce PCM write
                 service time by 45\% on average, which increases PCM
                 write throughput by 1.8x. As error correction
                 (redundant bits) is critical for PCM, we also consider
                 the impact of redundancy information in mapping data
                 and error correction bits to cell groups.
Our techniques lead to a 51\% average reduction in write service
                 time for a PCM main memory with ECC, which increases
                 IPC by 12\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seong:2013:TLC,
  author =       "Nak Hee Seong and Sungkap Yeo and Hsien-Hsin S. Lee",
  title =        "Tri-level-cell phase change memory: toward an
                 efficient and reliable memory system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "440--451",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485960",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "There are several emerging memory technologies looming
                 on the horizon to compensate the physical scaling
                 challenges of DRAM. Phase change memory (PCM) is one
                 such candidate proposed for being part of the main
                 memory in computing systems. One salient feature of PCM
                 is its multi-level-cell (MLC) property, which can be
                 used to multiply the memory capacity at the cell level.
                 However, due to the nature of PCM that the value
                 written to the cell can drift over time, PCM is prone
                 to a unique type of soft errors, posing a great
                 challenge for their practical deployment. This paper
                 first quantitatively studied the current art for MLC
                 PCM in dealing with the resistance drift problem and
                 showed that the previously proposed techniques such as
                 scrubbing or error correction mechanisms have
                 significant reliability challenges to overcome. We then
                 propose tri-level-cell PCM and demonstrate its ability
                 to achieving 10$^5$ x lower soft error rate than
                 four-level-cell PCM and 1.33 x higher information
                 density than single-level-cell PCM.
According to our findings, the tri-level-cell PCM shows 36.4\%
                 performance improvement over the four-level-cell PCM
                 while achieving the soft error rate of DRAM.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Azevedo:2013:ZME,
  author =       "Rodolfo Azevedo and John D. Davis and Karin Strauss
                 and Parikshit Gopalan and Mark Manasse and Sergey
                 Yekhanin",
  title =        "Zombie memory: extending memory lifetime by reviving
                 dead blocks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "452--463",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485961",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Zombie is an endurance management framework that
                 enables a variety of error correction mechanisms to
                 extend the lifetimes of memories that suffer from bit
                 failures caused by wearout, such as phase-change memory
                 (PCM). Zombie supports both single-level cell (SLC) and
                 multi-level cell (MLC) variants. It extends the
                 lifetime of blocks in working memory pages (primary
                 blocks) by pairing them with spare blocks, i.e.,
                 working blocks in pages that have been disabled due to
                 exhaustion of a single block's error correction
                 resources, which would be 'dead' otherwise. Spare
                 blocks adaptively provide error correction resources to
                 primary blocks as failures accumulate over time. This
                 reduces the waste caused by early block failures,
                 making working blocks in discarded pages a useful
                 resource. Even though we use PCM as the target
                 technology, Zombie applies to any memory technology
                 that suffers stuck-at cell failures. This paper
                 describes the Zombie framework, a combination of two
                 new error correction mechanisms (ZombieXOR for SLC and
                 ZombieMLC for MLC) and the extension of two previously
                 proposed SLC mechanisms (ZombieECP and ZombieERC).
The result is a 58\% to 92\% improvement in endurance for Zombie SLC memory and an even more impressive 11x to 17x improvement for ZombieMLC, both with performance overheads of only 0.1\% when memories using prior error correction mechanisms reach end of life.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Caulfield:2013:QSA, author = "Adrian M. Caulfield and Steven Swanson", title = "{QuickSAN}: a storage area network for fast, distributed, solid state disks", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "464--474", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485962", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Solid State Disks (SSDs) based on flash and other non-volatile memory technologies reduce storage latencies from 10s of milliseconds to 10s or 100s of microseconds, transforming previously inconsequential storage overheads into performance bottlenecks. This problem is especially acute in storage area network (SAN) environments where complex hardware and software layers (distributed file systems, block servers, network stacks, etc.) lie between applications and remote data. These layers can add hundreds of microseconds to requests, obscuring the performance of both flash memory and faster, emerging non-volatile memory technologies. We describe QuickSAN, a SAN prototype that eliminates most software overheads and significantly reduces hardware overheads in SANs. QuickSAN integrates a network adapter into SSDs, so the SSDs can communicate directly with one another to service storage accesses as quickly as possible. QuickSAN can also give applications direct access to both local and remote data without operating system intervention, further reducing software costs.
Our evaluation of QuickSAN demonstrates remote access latencies of 20 $ \mu $ s for 4 KB requests, bandwidth improvements of as much as 163x for small accesses compared with an equivalent iSCSI implementation, and 2.3--3.0x application level speedup for distributed sorting. We also show that QuickSAN improves energy efficiency by up to 96\% and that QuickSAN's networking connectivity allows for improved cluster-level energy efficiency under varying load.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sanchez:2013:ZFA, author = "Daniel Sanchez and Christos Kozyrakis", title = "{ZSim}: fast and accurate microarchitectural simulation of thousand-core systems", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "475--486", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485963", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "ISCA '13 conference proceedings.", abstract = "Architectural simulation is time-consuming, and the trend towards hundreds of cores is making sequential simulation even slower. Existing parallel simulation techniques either scale poorly due to excessive synchronization, or sacrifice accuracy by allowing event reordering and using simplistic contention models. As a result, most researchers use sequential simulators and model small-scale systems with 16--32 cores. With 100-core chips already available, developing simulators that scale to thousands of cores is crucial. We present three novel techniques that, together, make thousand-core simulation practical. First, we speed up detailed core models (including OOO cores) with instruction-driven timing models that leverage dynamic binary translation.
Second, we introduce bound-weave, a two-phase parallelization technique that scales parallel simulation on multicore hosts efficiently with minimal loss of accuracy. Third, we implement lightweight user-level virtualization to support complex workloads, including multiprogrammed, client-server, and managed-runtime applications, without the need for full-system simulation, sidestepping the lack of scalable OSs and ISAs that support thousands of cores. We use these techniques to build zsim, a fast, scalable, and accurate simulator. On a 16-core host, zsim models a 1024-core chip at speeds of up to 1,500 MIPS using simple cores and up to 300 MIPS using detailed OOO cores, 2--3 orders of magnitude faster than existing parallel simulators. Simulator performance scales well with both the number of modeled cores and the number of host cores. We validate zsim against a real Westmere system on a wide variety of workloads, and find performance and microarchitectural events to be within a narrow range of the real system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Leng:2013:GEE, author = "Jingwen Leng and Tayler Hetherington and Ahmed ElTantawy and Syed Gilani and Nam Sung Kim and Tor M. Aamodt and Vijay Janapa Reddi", title = "{GPUWattch}: enabling energy optimizations in {GPGPUs}", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "487--498", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485964", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "General-purpose GPUs (GPGPUs) are becoming prevalent in mainstream computing, and performance per watt has emerged as a more crucial evaluation metric than peak performance.
As such, GPU architects require robust tools that will enable them to quickly explore new ways to optimize GPGPUs for energy efficiency. We propose a new GPGPU power model that is configurable, capable of cycle-level calculations, and carefully validated against real hardware measurements. To achieve configurability, we use a bottom-up methodology and abstract parameters from the microarchitectural components as the model's inputs. We developed a rigorous suite of 80 microbenchmarks that we use to bound any modeling uncertainties and inaccuracies. The power model is comprehensively validated against measurements of two commercially available GPUs, and the measured error is within 9.9\% and 13.4\% for the two target GPUs (GTX 480 and Quadro FX5600). The model also accurately tracks the power consumption trend over time. We integrated the power model with the cycle-level simulator GPGPU-Sim and demonstrate the energy savings by utilizing dynamic voltage and frequency scaling (DVFS) and clock gating. Traditional DVFS reduces GPU energy consumption by 14.4\% by leveraging within-kernel runtime variations. More finer-grained SM cluster-level DVFS improves the energy savings from 6.6\% to 13.6\% for those benchmarks that show clustered execution behavior. 
We also show that clock gating inactive lanes during divergence reduces dynamic power by 11.2\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wu:2013:SMP, author = "Meng-Ju Wu and Minshu Zhao and Donald Yeung", title = "Studying multicore processor scaling via reuse distance analysis", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "499--510", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485965", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "The trend for multicore processors is towards increasing numbers of cores, with 100s of cores --- i.e., large-scale chip multiprocessors (LCMPs) --- possible in the future. The key to realizing the potential of LCMPs is the cache hierarchy, so studying how memory performance will scale is crucial. Reuse distance (RD) analysis can help architects do this. In particular, recent work has developed concurrent reuse distance (CRD) and private reuse distance (PRD) profiles to enable analysis of shared and private caches. Also, techniques have been developed to predict profiles across problem size and core count, enabling the analysis of configurations that are too large to simulate. This paper applies RD analysis to study the scalability of multicore cache hierarchies. We present a framework based on CRD and PRD profiles for reasoning about the locality impact of core count and problem scaling. We find interference-based locality degradation is more significant than sharing-based locality degradation. For 256 cores running small problems, the former occurs at small cache sizes, allowing moderate capacity scaling of multicore caches to achieve the same cache performance (MPKI) as a single-core cache.
At very large problems, interference-based locality degradation increases significantly in many of our benchmarks. For shared caches, this prevents most of our benchmarks from achieving constant-MPKI scaling within a 256 MB capacity budget; for private caches, all benchmarks cannot achieve constant-MPKI scaling within 256 MB.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{DuBois:2013:CSI, author = "Kristof {Du Bois} and Stijn Eyerman and Jennifer B. Sartor and Lieven Eeckhout", title = "Criticality stacks: identifying critical threads in parallel programs using synchronization behavior", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "511--522", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485966", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Analyzing multi-threaded programs is quite challenging, but is necessary to obtain good multicore performance while saving energy. Due to synchronization, certain threads make others wait, because they hold a lock or have yet to reach a barrier. We call these critical threads, i.e., threads whose performance is determinative of program performance as a whole. Identifying these threads can reveal numerous optimization opportunities, for the software developer and for hardware. In this paper, we propose a new metric for assessing thread criticality, which combines both how much time a thread is performing useful work and how many co-running threads are waiting. We show how thread criticality can be calculated online with modest hardware additions and with low overhead.
We use our metric to create criticality stacks that break total execution time into each thread's criticality component, allowing for easy visual analysis of parallel imbalance. To validate our criticality metric, and demonstrate it is better than previous metrics, we scale the frequency of the most critical thread and show it achieves the largest performance improvement. We then demonstrate the broad applicability of criticality stacks by using them to perform three types of optimizations: (1) program analysis to remove parallel bottlenecks, (2) dynamically identifying the most critical thread and accelerating it using frequency scaling to improve performance, and (3) showing that accelerating only the most critical thread allows for targeted energy reduction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kurian:2013:LAA, author = "George Kurian and Omer Khan and Srinivas Devadas", title = "The locality-aware adaptive cache coherence protocol", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "523--534", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485967", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Next generation multicore applications will process massive amounts of data with significant sharing. Data movement and management impacts memory access latency and consumes power. Therefore, harnessing data locality is of fundamental importance in future processors. We propose a scalable, efficient shared memory cache coherence protocol that enables seamless adaptation between private and logically shared caching of on-chip data at the fine granularity of cache lines.
Our data-centric approach relies on in-hardware yet low-overhead runtime profiling of the locality of each cache line and only allows private caching for data blocks with high spatio-temporal locality. This allows us to better exploit the private caches and enable low-latency, low-energy memory access, while retaining the convenience of shared memory. On a set of parallel benchmarks, our low-overhead locality-aware mechanisms reduce the overall energy by 25\% and completion time by 15\% in an NoC-based multicore with the Reactive-NUCA on-chip cache organization and the ACKwise limited directory-based coherence protocol.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kaxiras:2013:NPE, author = "Stefanos Kaxiras and Alberto Ros", title = "A new perspective for efficient virtual-cache coherence", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "535--546", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485968", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Coherent shared virtual memory (cSVM) is highly coveted for heterogeneous architectures as it will simplify programming across different cores and manycore accelerators. In this context, virtual L1 caches can be used to great advantage, e.g., saving energy consumption by eliminating address translation for hits. Unfortunately, multicore virtual-cache coherence is complex and costly because it requires reverse translation for any coherence request directed towards a virtual L1. The reason is the ambiguity of the virtual address due to the possibility of synonyms. In this paper, we take a radically different approach than all prior work which is focused on reverse translation. We examine the problem from the perspective of the coherence protocol.
We show that if a coherence protocol adheres to certain conditions, it operates effortlessly with virtual caches, without requiring reverse translations even in the presence of synonyms. We show that these conditions hold in a new class of simple and efficient request-response protocols that use both self-invalidation and self-downgrade. This results in a new solution for virtual-cache coherence, significantly less complex and more efficient than prior proposals. We study design choices for TLB placement under our proposal and compare them against those under a directory-MESI protocol. Our approach allows for choices that are particularly effective as for example combining all per-core TLBs in a single logical TLB in front of the last level cache. Significant area, energy, and performance benefits ensue as a result of simplifying the entire multicore memory organization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhao:2013:PAG, author = "Hongzhou Zhao and Arrvindh Shriraman and Snehasish Kumar and Sandhya Dwarkadas", title = "{Protozoa}: adaptive granularity cache coherence", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "547--558", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485969", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "State-of-the-art multiprocessor cache hierarchies propagate the use of a fixed granularity in the cache organization to the design of the coherence protocol. Unfortunately, the fixed granularity, generally chosen to match average spatial locality across a range of applications, not only results in wasted bandwidth to serve an individual thread's access needs, but also results in unnecessary coherence traffic for shared data.
The additional bandwidth has a direct impact on both the scalability of parallel applications and overall energy consumption. In this paper, we present the design of Protozoa, a family of coherence protocols that eliminate unnecessary coherence traffic and match data movement to an application's spatial locality. Protozoa continues to maintain metadata at a conventional fixed cache line granularity while (1) supporting variable read and write caching granularity so that data transfer matches application spatial granularity, (2) invalidating at the granularity of the write miss request so that readers to disjoint data can co-exist with writers, and (3) potentially supporting multiple non-overlapping writers within the cache line, thereby avoiding the traditional ping-pong effect of both read-write and write-write false sharing. Our evaluation demonstrates that Protozoa consistently reduce miss rate and improve the fraction of transmitted data that is actually utilized.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Demme:2013:FOM, author = "John Demme and Matthew Maycock and Jared Schmitz and Adrian Tang and Adam Waksman and Simha Sethumadhavan and Salvatore Stolfo", title = "On the feasibility of online malware detection with performance counters", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "559--570", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485970", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "The proliferation of computers in any domain is followed by the proliferation of malware in that domain. Systems, including the latest mobile platforms, are laden with viruses, rootkits, spyware, adware and other classes of malware.
Despite the existence of anti-virus software, malware threats persist and are growing as there exist a myriad of ways to subvert anti-virus (AV) software. In fact, attackers today exploit bugs in the AV software to break into systems. In this paper, we examine the feasibility of building a malware detector in hardware using existing performance counters. We find that data from performance counters can be used to identify malware and that our detection techniques are robust to minor variations in malware programs. As a result, after examining a small set of variations within a family of malware on Android ARM and Intel Linux platforms, we can detect many variations within that family. Further, our proposed hardware modifications allow the malware detector to run securely beneath the system software, thus setting the stage for AV implementations that are simpler and less buggy than software AV. Combined, the robustness and security of hardware AV techniques have the potential to advance state-of-the-art online malware detection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ren:2013:DSE, author = "Ling Ren and Xiangyao Yu and Christopher W. Fletcher and Marten van Dijk and Srinivas Devadas", title = "Design space exploration and optimization of path oblivious {RAM} in secure processors", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "571--582", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485971", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Keeping user data private is a huge problem both in cloud computing and computation outsourcing. One paradigm to achieve data privacy is to use tamper-resistant processors, inside which users' private data is decrypted and computed upon.
These processors need to interact with untrusted external memory. Even if we encrypt all data that leaves the trusted processor, however, the address sequence that goes off-chip may still leak information. To prevent this address leakage, the security community has proposed ORAM (Oblivious RAM). ORAM has mainly been explored in server/file settings which assume a vastly different computation model than secure processors. Not surprisingly, na{\"\i}vely applying ORAM to a secure processor setting incurs large performance overheads. In this paper, a recent proposal called Path ORAM is studied. We demonstrate techniques to make Path ORAM practical in a secure processor setting. We introduce background eviction schemes to prevent Path ORAM failure and allow for a performance-driven design space exploration. We propose a concept called super blocks to further improve Path ORAM's performance, and also show an efficient integrity verification scheme for Path ORAM. With our optimizations, Path ORAM overhead drops by 41.8\%, and SPEC benchmark execution time improves by 52.4\% in relation to a baseline configuration. Our work can be used to improve the security level of previous secure processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wassel:2013:SLL, author = "Hassan M. G. Wassel and Ying Gao and Jason K. Oberg and Ted Huffmire and Ryan Kastner and Frederic T. 
Chong and Timothy Sherwood", title = "{SurfNoC}: a low latency and provably non-interfering approach to secure networks-on-chip", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "583--594", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485972", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "As multicore processors find increasing adoption in domains such as aerospace and medical devices where failures have the potential to be catastrophic, strong performance isolation and security become first-class design constraints. When cores are used to run separate pieces of the system, strong time and space partitioning can help provide such guarantees. However, as the number of partitions or the asymmetry in partition bandwidth allocations grows, the additional latency incurred by time multiplexing the network can significantly impact performance. In this paper, we introduce SurfNoC, an on-chip network that significantly reduces the latency incurred by temporal partitioning. By carefully scheduling the network into waves that flow across the interconnect, data from different domains carried by these waves are strictly non-interfering while avoiding the significant overheads associated with cycle-by-cycle time multiplexing. We describe the scheduling policy and router microarchitecture changes required, and evaluate the information-flow security of a synthesizable implementation through gate-level information flow analysis.
When comparing our approach for varying numbers of domains and network sizes, we find that in many cases SurfNoC can reduce the latency overhead of implementing cycle-level non-interference by up to 85\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2013:VPD, author = "Di Wang and Chuangang Ren and Anand Sivasubramaniam", title = "Virtualizing power distribution in datacenters", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "595--606", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485973", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "ISCA '13 conference proceedings.", abstract = "Power infrastructure contributes to a significant portion of datacenter expenditures. Overbooking this infrastructure for a high percentile of the needs is becoming more attractive than for occasional peaks. There exist several computing knobs to cap the power draw within such under-provisioned capacity. Recently, batteries and other energy storage devices have been proposed to provide a complementary alternative to these knobs, which when decentralized (or hierarchically placed), can temporarily take the load to suppress power peaks propagating up the hierarchy. With aggressive under-provisioning, the power hierarchy becomes as central a datacenter resource as other computing resources, making it imperative to carefully allocate, isolate and manage this resource (including batteries), across applications. Towards this goal, we present vPower, a software system to virtualize power distribution. vPower includes mechanisms and policies to provide a virtual power hierarchy for each application.
It leverages traditional computing knobs as well as batteries, to apportion and manage the infrastructure between co-existing applications in the hierarchy. vPower allows applications to specify their power needs, performs admission control and placement, dynamically monitors power usage, and enforces allocations for fairness and system efficiency. Using several datacenter applications, and a 2-level power hierarchy prototype containing batteries at both levels, we demonstrate the effectiveness of vPower when working in an under-provisioned power infrastructure, using the right computing knobs and the right batteries at the right time. Results show over 50\% improved system utilization and scale-out for vPower's over-booking, and between 12--28\% better application performance than traditional power-capping control knobs. It also ensures isolation between applications competing for power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yang:2013:BFP, author = "Hailong Yang and Alex Breslow and Jason Mars and Lingjia Tang", title = "{Bubble-Flux}: precise online {QoS} management for increased utilization in warehouse scale computers", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "607--618", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485974", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Ensuring the quality of service (QoS) for latency-sensitive applications while allowing co-locations of multiple applications on servers is critical for improving server utilization and reducing cost in modern warehouse-scale computers (WSCs).
Recent work relies on static profiling to precisely predict the QoS degradation that results from performance interference among co-running applications to increase the number of ``safe'' co-locations. However, these static profiling techniques have several critical limitations: (1) a priori knowledge of all workloads is required for profiling, (2) it is difficult for the prediction to capture or adapt to phase or load changes of applications, and (3) the prediction technique is limited to only two co-running applications. To address all of these limitations, we present Bubble-Flux, an integrated dynamic interference measurement and online QoS management mechanism to provide accurate QoS control and maximize server utilization. Bubble-Flux uses a Dynamic Bubble to probe servers in real time to measure the instantaneous pressure on the shared hardware resources and precisely predict how the QoS of a latency-sensitive job will be affected by potential co-runners. Once ``safe'' batch jobs are selected and mapped to a server, Bubble-Flux uses an Online Flux Engine to continuously monitor the QoS of the latency-sensitive application and control the execution of batch jobs to adapt to dynamic input, phase, and load changes to deliver satisfactory QoS. Batch applications remain in a state of flux throughout execution. 
Our results show that the utilization improvement achieved by Bubble-Flux is up to 2.2x better than the prior static approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mars:2013:WMH, author = "Jason Mars and Lingjia Tang", title = "{Whare-map}: heterogeneity in ``homogeneous'' warehouse-scale computers", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "619--630", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485975", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Modern ``warehouse scale computers'' (WSCs) continue to be embraced as homogeneous computing platforms. However, due to frequent machine replacements and upgrades, modern WSCs are in fact composed of diverse commodity microarchitectures and machine configurations. Yet, current WSCs are architected with the assumption of homogeneity, leaving a potentially significant performance opportunity unexplored. In this paper, we expose and quantify the performance impact of the ``homogeneity assumption'' for modern production WSCs using industry-strength large-scale web-service workloads. In addition, we argue for, and evaluate the benefits of, a heterogeneity-aware WSC using commercial web-service production workloads including Google's web-search. We also identify key factors impacting the available performance opportunity when exploiting heterogeneity and introduce a new metric, opportunity factor, to quantify an application's sensitivity to the heterogeneity in a given WSC. To exploit heterogeneity in ``homogeneous'' WSCs, we propose ``Whare-Map,'' the WSC Heterogeneity Aware Mapper that leverages already in-place continuous profiling subsystems found in production environments.
When employing ``Whare-Map'', we observe a cluster-wide performance improvement of 15\% on average over heterogeneity-oblivious job placement and up to an 80\% improvement for web-service applications that are particularly sensitive to heterogeneity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Foutris:2013:DMA, author = "Nikos Foutris and Dimitris Gizopoulos and Xavier Vera and Antonio Gonzalez", title = "Deconfigurable microprocessor architectures for silicon debug acceleration", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "631--642", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485976", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "The share of silicon debug in the overall microprocessor chips development cycle is rapidly expanding due to the ever growing design complexity and the limited efficiency of pre-silicon validation methods. Massive application of short random test programs on the prototype microprocessor chips is one of the most effective parts of silicon debug. However, a major bottleneck and source of ``noise'' in this phase is that large numbers of random test programs fail due to the same or similar design bugs. This redundant behavior adds long delays in the debug flow since each failing random program must be separately examined, although it does not usually bring new debug information. The development of effective techniques that detect dominant modes of failure among random programs and triage them into common categories eliminate redundant debug sessions and significantly boost silicon debug.
We propose the employment of deconfigurable microprocessor architectures along with self-checking random test programs to reduce the redundant debug sessions and make the triage step of silicon debug more efficient. Several hardware components of high performance microprocessor micro-architectures can be deconfigured while keeping the functional completeness of the design. This is the property we exploit in our silicon debug methodology for the triaging of random test programs. We support our methodology by a hardware mechanism dedicated to silicon debug that groups the failing test programs into categories depending on the microprocessor hardware components that need to be deconfigured for a random test program to be correctly executed. Identical deconfiguration sequences for multiple test programs indicate the existence of redundancy among them and group them together. This grouping significantly reduces the number of failing tests that must be debugged afterwards. Detailed evaluation of the method on an x86 microprocessor demonstrates its efficiency in reducing the debug sessions and thus in accelerating silicon debug.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Pokam:2013:QPI, author = "Gilles Pokam and Klaus Danne and Cristiano Pereira and Rolf Kassa and Tim Kranich and Shiliang Hu and Justin Gottschlich and Nima Honarmand and Nathan Dautenhahn and Samuel T. 
King and Josep Torrellas", title = "{QuickRec}: prototyping an {Intel} architecture extension for record and replay of multithreaded programs", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "643--654", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485977", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "There has been significant interest in hardware-assisted deterministic Record and Replay (RnR) systems for multithreaded programs on multiprocessors. However, no proposal has implemented this technique in a hardware prototype with full operating system support. Such an implementation is needed to assess RnR practicality. This paper presents QuickRec, the first multicore Intel Architecture (IA) prototype of RnR for multithreaded programs. QuickRec is based on QuickIA, an Intel emulation platform for rapid prototyping of new IA extensions. QuickRec is composed of a Xeon server platform with FPGA-emulated second-generation Pentium cores, and Capo3, a full software stack for managing the recording hardware from within a modified Linux kernel. This paper's focus is understanding and evaluating the implementation issues of RnR on a real platform. Our effort leads to some lessons learned, as well as to some pointers for future research. We demonstrate that RnR can be implemented efficiently on a real multicore IA system. In particular, we show that the rate of memory log generation is insignificant, and that the recording hardware has negligible performance overhead.
However, the software stack incurs an average recording overhead of nearly 13\%, which must be reduced to enable always-on use of RnR.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Huang:2013:NRC, author = "Ruirui Huang and Erik Halberg and G. Edward Suh", title = "Non-race concurrency bug detection through order-sensitive critical sections", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "655--666", month = jun, year = "2013", DOI = "https://doi.org/10.1145/2508148.2485978", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "This paper introduces a new heuristic condition for non-race concurrency bugs, named order-sensitive critical sections, and proposes a run-time bug detection scheme based on the condition. The order-sensitive critical sections are defined as a pair of critical sections that can lead to non-deterministic shared memory state depending on the order in which they execute. In a sense, the order-sensitive critical sections can be seen as extending the intuition in using data races as a potential bug condition to capture non-race bugs. Experiments show that the proposed scheme provides a good coverage for multiple types of non-race bugs, with a small number of false positives. For example, the scheme detected all 9 real-world non-race bugs that were tested as well as over 90\% of injected non-race bugs. Additionally, this paper presents an efficient hardware architecture that supports the proposed scheme with minor hardware changes and a small amount of additional state --- a 9-KB buffer per core and a 1-bit tag per data cache block. The hardware-based scheme could still detect all 9 real-world bugs that were tested and more than 84\% of the injected non-race bugs.
Moreover, the hardware supported scheme has a negligible impact on performance, with a 0.23\% slowdown on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Maitra:2013:HEM, author = "Subhashis Maitra and Amitabha Sinha", title = "High efficiency {MAC} unit used in digital signal processing and elliptic curve cryptography", journal = j-COMP-ARCH-NEWS, volume = "41", number = "4", pages = "1--7", month = sep, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2560488.2560490", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 2 17:25:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Computational complexities of different Algorithms to enhance the speed of response of digital signal processor and different cryptographic analysis are the important issues for the current researcher. Computational complexities means hardware complexities and timing complexities. Both the complexities depend on the design of the software and hardware. Arithmetic computation like addition and multiplication are the major parts in designing processor that helps to improve the efficiency and to reduce complexities. Hence the design of a multiplier unit is the major issue to the current researchers. There are different multiplication algorithms discussed in different research materials. In this paper, a new algorithm for multiplication has been proposed to enhance the speed of operation and to reduce hardware complexities. 
Also a comparative study of the proposed algorithm over different existing algorithms has been explained here along with VHDL model of the proposed architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Janjusic:2013:GMP, author = "Tomislav Janjusic and Krishna Kavi", title = "{Gleipnir}: a memory profiling and tracing tool", journal = j-COMP-ARCH-NEWS, volume = "41", number = "4", pages = "8--12", month = sep, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2560488.2560491", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 2 17:25:55 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this article we describe a memory tracing and profiling tool called Gleipnir. Gleipnir is a plug-in tool for a widely used binary instrumentation framework, Valgrind. Gleipnir's ability to collect fine grained memory traces and associate each access to source level data structures and elements of these structures, makes it a good candidate tool for advanced memory analysis and studying complex memory access patterns. The data provided by Gleipnir may be used by cache simulators to analyze accesses to data structure elements and understand the dynamic memory behavior of programs. The goal of Gleipnir is to give the programmer aid in refactoring data and code. In addition to Gleipnir we introduce a cache simulation tool, Gl cSim. 
Gl cSim is an extension to DineroIV (a uni-processor simulator) that tracks Gleipnir provided trace and debug-information.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2013:INb, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "41", number = "4", pages = "13--22", month = sep, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2560488.2560493", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 2 17:25:55 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Godard:2013:MSS, author = "Ivan Godard", title = "The {Mill}: split-stream encoding", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "1--5", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641363", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Real-world programs often thrash in the instruction cache, especially when SMT methods are used. 
The Mill\texttrademark{} split-stream encoding doubles the effective capacity of the instruction cache at no increase in per-instruction power usage or cache access latency, while also sharply increasing the potential maximal decode rate for instruction sets that use variable-length encoding.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thomasian:2013:DAM, author = "Alexander Thomasian", title = "Disk arrays with multiple {RAID} levels", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "6--24", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641364", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose Heterogeneous Disk Arrays (HDAs), which allow multiple RAID levels for database applications to coexist in a single disk array accommodating multiple RAID levels. Our main concern is to efficiently utilize disk bandwidth and capacity, while balancing disk loads in a cloud storage environment, however, a small number of disks is considered in this study for illustrative purposes. Individual RAID levels are adjusted to data availability requirements and workload demands. Adopting the most stringent availability requirements for all datasets would incur unnecessarily high bandwidth overhead for updating datasets, which do not have this requirement. Intermixing RAID levels is beneficial from the viewpoint of balancing disk loads, similarly to the striping paradigm in RAID5. The suitability of the RAID levels varies with database applications: RAID5 --- reading/writing large datasets for data mining and warehousing, RAID1 --- high performance OLTP applications.
Several single pass data allocation methods are proposed in this paper and compared using synthetically generated allocation requests in associated papers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Maitra:2013:DSM, author = "Subhashis Maitra and Amitabha Sinha", title = "Design and simulation of {MAC} unit using combinational circuit and adder", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "25--33", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641365", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware and timing complexities of MAC unit to perform arithmetic operation like addition or multiplication especially in the field of Digital Signal Processing (DSP) or Elliptic Curve Cryptography (ECC) are the major issues to the designer. The multiplication operation is essential and abundant in DSP Applications. In order to achieve maximum implementation efficiency and timing performance, designing a DSP systems is critical and frequently presents a significant challenge to hardware engineers. There are certain multipliers that simplify this challenge by abstracting away FPGA device specifics, while maintaining the required maximum performance and resource efficiency. These multipliers are able to perform parallel multiplication and hence constant coefficient multiplication, both with differing implementation styles. Again with the aid of instantaneous resource estimation, hardware engineers can rapidly select the optimal solution for their system. 
The latest additions to the IP provide fine control over the latency using the concept of pipelining of the multipliers that are purely combinatorial to be fully pipelined. Here a new compensation method that reduces both the hardware and timing complexities of the multiplier used for DSP application or ECC application has been proposed. The design of the MAC unit based on the proposed compensation method has been dealt here properly using Xilinx 13.2 and compared with array multiplier, Booth multiplier and Vedic multiplier to show its novelty over them. The hardware complexity is reduced to about 60\% of the original multiplier. Design results show that the proposed architecture has lower hardware overhead, lower error and fast operating speed as compared with array, Booth and Vedic multiplier.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chau:2013:ASM, author = "Thomas C. P. Chau and James S. Targett and Marlon Wijeyasinghe and Wayne Luk and Peter Y. K. Cheung and Benjamin Cope and Alison Eele and Jan Maciejowski", title = "Accelerating sequential {Monte Carlo} method for real-time air traffic management", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "35--40", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641367", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents how field-programmable gate arrays (FPGAs) are used to accelerate the Sequential Monte Carlo method for air traffic management. A novel data structure is introduced for a particle stream that enables efficient evaluation of constraints and weights. 
A parallel implementation for this streaming data structure is designed, and an analytical model is provided for estimating the performance and resource usage of our implementation. We compare our design to implementations on CPU and GPU. We show 9.3 times speed up and 89 times improvement in energy efficiency over an Intel Core i7-950 CPU with 8 threads and demonstrate 1.3 times speed up and 13.5 times improvement in energy efficiency over an NVIDIA Tesla C2070 GPU with 448 cores. We also estimate the performance of FPGA in future scenario and show that FPGA is able to control 15 times and 2.8 times more aircraft than CPU and GPU in real-time respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Mahram:2013:NBC, author = "Atabak Mahram and Martin C. Herbordt", title = "{NCBI BLASTP} on the {Convey HC1-EX}", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "41--46", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641368", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The BLAST sequence alignment program is a central application in bioinformatics. The de facto standard version, NCBI BLAST, uses complex heuristics which make it challenging to simultaneously achieve both high performance and exact agreement. In previous work, a system that used novel FPGA-based filters reduced the input database by over 99.97\% without loss of sensitivity. In the present work we report experiences in getting from a prototype to a potential product for the Convey HC1-EX. There are several issues. The first is the efforts made to maintain timing for a highly complex configuration as it is optimized by including additional filter stages. 
This requires implementation and optimization of new interface logic as well as floor-planning. The second is the system-level tradeoffs necessary to maintain correctness. The issue here is preventing low frequency events, which necessarily cannot be mapped to the FPGA, from diluting the performance benefits without sacrificing sensitivity. We present results for various usage scenarios and find a factor of nearly 5x speed-up over a fully parallel implementation of the reference code on a contemporaneous CPU. We believe that the resulting system is the leading accelerated NCBI BLAST. The significance of this work is that, while such in-depth work is necessary to achieve high performance for complex systems, these issues are rarely described in the academic literature.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sano:2013:ECC, author = "Kentaro Sano and Yoshiaki Kono and Hayato Suzuki and Ryotaro Chiba and Ryo Ito and Tomohiro Ueno and Kyo Koizumi and Satoru Yamamoto", title = "Efficient custom computing of fully-streamed lattice {Boltzmann} method on tightly-coupled {FPGA} cluster", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "47--52", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641369", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents the detailed design of a custom computing machine for fully-streamed LBM computation on multiple FPGAs, and evaluates its efficiency with prototype implementation. We design a unit for completely streamed computation including boundary treatment with a newly introduced cell attribute. 
Experimental results demonstrate that the proposed machine achieves high utilization of PEs, 99 \% of the peak performance, for one and two FPGAs computing a large lattice. This is due to our fully-streamed design to allow all arithmetic units to be efficiently utilized with a constant memory bandwidth, and the architecture to exploit a low-latency accelerator domain network (ADN) of a tightly-coupled FPGA cluster for scalable computation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Vanderbauwhede:2013:HCF, author = "Wim Vanderbauwhede and Anton Frolov and Sai Rahul Chalamalasetti and Martin Margala", title = "A hybrid {CPU--FPGA} system for high throughput {(10Gb/s)} streaming document classification", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "53--58", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641370", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Processing large volumes of information in real time requires large amounts of computational power, which consumes a significant amount of energy. With the rise in the amount of data produced, energy-efficient high-performance information processing systems are becoming a necessity. We present a hybrid CPU-FPGA system for high-throughput classification of streams of textual documents (e.g. emails or web pages). The current system parses the document stream using a multicore CPU and performs classification on the parsed stream using Field-Programmable Gate Arrays (FPGAs). As an example, we demonstrate a Naive Bayes classifier on the TREC Aquaint dataset. Our current solution can classify 10Gb/s internet traffic in real time. 
Our aim is to increase the throughput to 100Gb/s by incorporating the parser into the FPGA design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Guo:2013:CPE, author = "Ce Guo and Wayne Luk and Ekaterina Vinkovskaya and Rama Cont", title = "Customisable pipelined engine for intensity evaluation in multivariate {Hawkes} point processes", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "59--64", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641371", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hawkes processes are point processes that can be used to build probabilistic models to capture occurrence patterns of random events. They are widely used in high-frequency trading, seismic analysis and neuroscience. A critical calculation in Hawkes process models is intensity evaluation. The intensity of a point process represents the instantaneous rate of occurrence of events, but it is computationally expensive and challenging to calculate efficiently in order to make predictions using Hawkes process models. To accelerate the computation, we analyse data dependency in the intensity evaluation routine, and present a strategy to enable multiple intensities to be computed with a single pass through the data. We then design and optimise a pipelined hardware engine based on our strategy. In our experiments, an FPGA-based implementation of the proposed engine is evaluated by four case studies. 
This implementation achieves up to 94 times speedup over an optimised CPU implementation with one core, and 12 times speedup over a CPU with eight cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Giefers:2013:AFD, author = "Heiner Giefers and Christian Plessl and Jens F{\"o}rstner", title = "Accelerating finite difference time domain simulations with reconfigurable dataflow computers", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "65--70", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641372", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Finite difference methods are widely used, highly parallel algorithms for solving differential equations. However, the algorithms are memory bound and thus difficult to implement efficiently on CPUs or GPUs. In this work we study the implementation of the finite difference time domain (FDTD) method for solving Maxwell's equations on an FPGA-based Maxeler dataflow computer. We evaluate our work with actual problems from the domain of computational nanophotonics. The use of realistic simulations requires us to pay special attention to boundary conditions (Dirichlet, periodic, absorbing), which are critical for the correctness of results but detrimental to the performance and thus frequently neglected. We discuss and evaluate the design of two different FDTD implementations, which outperform CPU and GPU implementations. 
To our knowledge, our implementation is the fastest FPGA-based FDTD solver.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ogawa:2013:RJA, author = "Yuki Ogawa and Masahiro Iida and Motoki Amagasaki and Morihiro Kuga and Toshinori Sueyoshi", title = "A reconfigurable {Java} accelerator with software compatibility for embedded systems", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "71--76", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641373", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ohkawa:2013:RHO, author = "Takeshi Ohkawa and Daichi Uetake and Takashi Yokota and Kanemitsu Ootsu and Takanobu Baba", title = "Reconfigurable and hardwired {ORB} engine on {FPGA} by {Java-to-HDL} synthesizer for realtime application", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "77--82", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641374", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A platform for networked FPGA system design, which is named ``ORB Engine'', is proposed to add more controllability and design productivity on FPGA-based systems composed of software and hardwired IPs. 
A developer can define an object-oriented interface for the circuit IP in FPGA, and implement the control sequence part using Java. The circuit IP in FPGA can be handled through object-oriented interface from variety of programming languages like C++, Java, Python, Ruby and so on. Application specific and high-efficiency circuit for ORB (Object Request Broker) protocol processing is synthesized from easy-handling Java code using JavaRock Java-to-HDL synthesizer within the de-facto standard CORBA (Common Object Request Broker Architecture). The measurement result shows a very low latency as low as 200us of UDP/IP packet in/out and exhibits a fluctuation free delay performance, which is desirable for real-time applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{deDinechin:2013:FPT, author = "Florent de Dinechin and Matei Istoan and Guillaume Sergent", title = "Fixed-point trigonometric functions on {FPGAs}", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "83--88", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641375", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/elefunt.bib; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Three approaches for computing sines and cosines on FPGAs are studied in this paper, with a focus of high-throughput pipelined architecture, and state-of-the-art implementation techniques. The first approach is the classical CORDIC iteration, for which we suggest a reduced iteration technique and fine optimizations in datapath width and latency. The second is an ad-hoc architecture specifically designed around trigonometric identities. 
The third uses a generic table- and DSP-based polynomial approximator. These three architectures are implemented and compared in the FloPoCo framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tada:2013:PED, author = "Jubee Tada", title = "Performance evaluation of {$3$-D} stacked $ 32$-bit parallel multipliers", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "89--94", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641376", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Conventional two-dimensional (2-D) implementation technologies face certain limitations; to overcome these limitations, three-dimensional (3-D) integration technologies have been developed. There has been a focus on circuit partitioning strategies because they play an important role in exploiting the potential of 3-D stacked circuits. The Middle-Grain circuit partitioning strategy has been proposed to exploit the potential of 3-D stacked circuits. The proposed strategy equalizes the area of each layer and avoids the critical paths across different layers as much as possible. In this study, 3-D stacked parallel multipliers are designed using various circuit partitioning strategies. 
Experimental results demonstrate that the 3-D stacked 32-bit parallel multiplier, designed using the proposed strategy, achieves a 27\% delay reduction as compared to the 2-D implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tanaka:2013:USP, author = "Yuichiroh Tanaka and Shimpei Sato and Kenji Kise", title = "The {UltraSmall} soft processor", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "95--100", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641377", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A soft processor is a processor that is implemented using logic synthesis mainly targeting programmable logic device like FPGA and it becomes a common component for FPGA designs. The supersmall soft processor (small-core) developed at University of Toronto is a unique soft processor because its main concern is very low hardware cost while supporting 32-bit ISA. With the same concept as small-core, we are developing the ultrasmall soft processor (UltraSmall) based on smallcore. The goal of this project is to implement the smallest 32-bit ISA soft processor while aiming to achieve high performance. We propose UltraSmall and describe its key ideas and implementations. The evaluation results indicate that the hardware cost of UltraSmall is smaller than smallcore in the latest FPGA while achieving 1.8x performance of small-core.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Guo:2013:CAS, author = "Liucheng Guo and David B. 
Thomas and Wayne Luk", title = "Customisable architectures for the set covering problem", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "101--106", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641378", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes novel customisable streaming architectures for the NP-hard set covering problem. Our approach covers both exhaustive and genetic algorithms, supporting coarse-grain parallelism and deep pipelines while allowing trade-offs between performance and resource usage. Experiments targeting Maxeler systems show that our FPGA-based designs are more effective than the corresponding multicore software versions. The speed up of the exhaustive algorithm exceeds 250 times, and that of the genetic algorithm exceeds 60 times. Meanwhile, our implementations are more flexible than other FPGA solutions, allowing users to customise parameters at run time without recompilation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Plumbridge:2013:BPR, author = "Gary Plumbridge and Jack Whitham and Neil Audsley", title = "{Blueshell}: a platform for rapid prototyping of multiprocessor {NoCs} and accelerators", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "107--112", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641379", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The rapid increase in FPGA logic capacity has enabled the prototyping of multiprocessor Network-on-Chip (NoC) architectures.
However, the design space exploration of these complex architectures is highly time consuming with traditional methodologies for FPGA design. Our paper addresses the challenges of multiprocessor network design with the Blueshell framework for generating multiprocessor networks on chip (NoC) and a coupled Java software stack, Network-Chi. With Blueshell hardware is constructed from high-level components including processors and routers using concise Bluespec System Verilog. The Network-Chi software framework is also presented to enable programming the on-chip processors in a familiar Java style and without exposing the low-level systems programming to the application designer. We demonstrate that Blueshell systems with as many as 20 processors can be implemented on a modestly sized FPGA. Performance figures for a selection of distributed applications are also provided.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hong:2013:RTR, author = "Chuan Hong and Khaled Benkrid and Nazrin Isa and Xabier Iturbe", title = "A run-time reconfigurable system for adaptive high performance efficient computing", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "113--118", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641380", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Field programmable hardware gives electronic systems the ability to be reconfigured at run time. This allows electronic systems to be more efficiently customized on demand and on-the-fly depending on user requirements and environmental changes. 
This paper presents a run-time reconfigurable system that allows computing tasks to adjust their sizes in response to current available resources, optimizing the overall performance by maximally exploiting all the resources on the chip. In particular, we present a novel run-time task assembler, which assembles tasks with desired parameters on-the-fly, together with an efficacious run-time task placer to rapidly configure tasks at optimum locations. The system is demonstrated with a dynamic programming-based pairwise sequence alignment application. Real hardware implementation result shows that our run-time reconfigurable system optimizes resource usage on the fly by $ \sim $3x, while matching the performance of carefully hand-crafted static solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2013:INc, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "41", number = "5", pages = "119--127", month = dec, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2641361.2641382", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:43 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Davis:2014:IWA, author = "Al Davis", title = "Inside {Windows Azure}: the challenges and opportunities of a cloud operating system", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "1--2", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2560008", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cloud operating systems provide on-demand,
scalable compute and storage resources. They allow service developers to focus on their business logic by simplifying many portions of their service, including resource management, provisioning, monitoring, and application lifecycle management. This talk describes some of the technical challenges faced, as well as emergent opportunities created, by Microsoft's cloud operating system Windows Azure.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Novakovic:2014:SN, author = "Stanko Novakovic and Alexandros Daglis and Edouard Bugnion and Babak Falsafi and Boris Grot", title = "Scale-out {NUMA}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "3--18", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541965", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging datacenter applications operate on vast datasets that are kept in DRAM to minimize latency. The large number of servers needed to accommodate this massive memory footprint requires frequent server-to-server communication in applications such as key-value stores and graph-based applications that rely on large irregular data structures. The fine-grained nature of the accesses is a poor match to commodity networking technologies, including RDMA, which incur delays of 10--1000$ \times $ over local DRAM operations. We introduce Scale-Out NUMA (soNUMA) --- an architecture, programming model, and communication protocol for low-latency, distributed in-memory processing. soNUMA layers an RDMA-inspired programming model directly on top of a NUMA memory fabric via a stateless messaging protocol. 
To facilitate interactions between the application, OS, and the fabric, soNUMA relies on the remote memory controller --- a new architecturally-exposed hardware block integrated into the node's local coherence hierarchy. Our results based on cycle-accurate full-system simulation show that soNUMA performs remote reads at latencies that are within 4$ \times $ of local DRAM, can fully utilize the available memory bandwidth, and can issue up to 10M remote memory operations per second per core.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Agrawal:2014:RHD, author = "Sandeep R. Agrawal and Valentin Pistol and Jun Pang and John Tran and David Tarjan and Alvin R. Lebeck", title = "{Rhythm}: harnessing data parallel hardware for server workloads", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "19--34", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541956", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Trends in increasing web traffic demand an increase in server throughput while preserving energy efficiency and total cost of ownership. Present work in optimizing data center efficiency primarily focuses on the data center as a whole, using off-the-shelf hardware for individual servers. Server capacity is typically increased by adding more machines, which is cheap, though inefficient in the long run in terms of energy and area. Our work builds on the observation that server workload execution patterns are not completely unique across multiple requests.
We present a framework---called Rhythm---for high throughput servers that can exploit similarity across requests to improve server performance and power/energy efficiency by launching data parallel executions for request cohorts. An implementation of the SPECWeb Banking workload using Rhythm on NVIDIA GPUs provides a basis for evaluating both software and hardware for future cohort-based servers. Our evaluation of Rhythm on future server platforms shows that it achieves 4x the throughput (reqs/sec) of a core i7 at efficiencies (reqs/Joule) comparable to a dual core ARM Cortex A9. A Rhythm implementation that generates transposed responses achieves 8x the i7 throughput while processing 2.5x more requests/Joule compared to the A9.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Samadi:2014:PPB, author = "Mehrzad Samadi and Davoud Anoushe Jamshidi and Janghaeng Lee and Scott Mahlke", title = "{Paraprox}: pattern-based approximation for data parallel applications", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "35--50", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541948", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Approximate computing is an approach where reduced accuracy of results is traded off for increased speed, throughput, or both. Loss of accuracy is not permissible in all computing domains, but there are a growing number of data-intensive domains where the output of programs need not be perfectly correct to provide useful results or even noticeable differences to the end user. These soft domains include multimedia processing, machine learning, and data mining/analysis. 
An important challenge with approximate computing is transparency to insulate both software and hardware developers from the time, cost, and difficulty of using approximation. This paper proposes a software-only system, Paraprox, for realizing transparent approximation of data-parallel programs that operates on commodity hardware systems. Paraprox starts with a data-parallel kernel implemented using OpenCL or CUDA and creates a parameterized approximate kernel that is tuned at runtime to maximize performance subject to a target output quality (TOQ) that is supplied by the user. Approximate kernels are created by recognizing common computation idioms found in data-parallel programs (e.g., Map, Scatter/Gather, Reduction, Scan, Stencil, and Partition) and substituting approximate implementations in their place. Across a set of 13 soft data-parallel applications with at most 10\% quality degradation, Paraprox yields an average performance gain of 2.7x on a NVIDIA GTX 560 GPU and 2.5x on an Intel Core i7 quad-core processor compared to accurate execution on each platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Bornholt:2014:UFO, author = "James Bornholt and Todd Mytkowicz and Kathryn S. McKinley", title = "{Uncertain$<$T$>$}: a first-order type for uncertain data", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "51--66", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541958", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging applications increasingly use estimates such as sensor data (GPS), probabilistic models, machine learning, big data, and human data.
Unfortunately, representing this uncertain data with discrete types (floats, integers, and booleans) encourages developers to pretend it is not probabilistic, which causes three types of uncertainty bugs. (1) Using estimates as facts ignores random error in estimates. (2) Computation compounds that error. (3) Boolean questions on probabilistic data induce false positives and negatives. This paper introduces Uncertain$<$T$>$. Whereas previous probabilistic programming languages focus on experts, Uncertain$<$T$>$ \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Santos:2014:UAT, author = "Nuno Santos and Himanshu Raj and Stefan Saroiu and Alec Wolman", title = "Using {ARM} {TrustZone} to build a trusted language runtime for mobile applications", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "67--80", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541949", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents the design, implementation, and evaluation of the Trusted Language Runtime (TLR), a system that protects the confidentiality and integrity of .NET mobile applications from OS security breaches. TLR enables separating an application's security-sensitive logic from the rest of the application, and isolates it from the OS and other apps. TLR provides runtime support for the secure component based on a .NET implementation for embedded devices. TLR reduces the TCB of an open source .NET implementation by a factor of $ 78 $ with a tolerable performance cost. The main benefit of the TLR is to bring the developer benefits of managed code to trusted computing.
With the TLR, developers can build their trusted components with the productivity benefits of modern high level languages, such as strong typing and garbage collection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Criswell:2014:VGP, author = "John Criswell and Nathan Dautenhahn and Vikram Adve", title = "{Virtual Ghost}: protecting applications from hostile operating systems", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "81--96", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541986", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Applications that process sensitive data can be carefully designed and validated to be difficult to attack, but they are usually run on monolithic, commodity operating systems, which may be less secure. An OS compromise gives the attacker complete access to all of an application's data, regardless of how well the application is built. We propose a new system, Virtual Ghost, that protects applications from a compromised or even hostile OS. Virtual Ghost is the first system to do so by combining compiler instrumentation and run-time checks on operating system code, which it uses to create ghost memory that the operating system cannot read or write. Virtual Ghost interposes a thin hardware abstraction layer between the kernel and the hardware that provides a set of operations that the kernel must use to manipulate hardware, and provides a few trusted services for secure applications such as ghost memory management, encryption and signing services, and key management. Unlike previous solutions, Virtual Ghost does not use a higher privilege level than the kernel. 
Virtual Ghost performs well compared to previous approaches; it outperforms InkTag on five out of seven of the LMBench microbenchmarks with improvements between 1.3x and 14.3x. For network downloads, Virtual Ghost experiences a 45\% reduction in bandwidth at most for small files and nearly no reduction in bandwidth for large files and web traffic. An application we modified to use ghost memory shows a maximum additional overhead of 5\% due to the Virtual Ghost protections. We also demonstrate Virtual Ghost's efficacy by showing how it defeats sophisticated rootkit attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Li:2014:SLH, author = "Xun Li and Vineeth Kashyap and Jason K. Oberg and Mohit Tiwari and Vasanth Ram Rajarathinam and Ryan Kastner and Timothy Sherwood and Ben Hardekopf and Frederic T. Chong", title = "{Sapper}: a language for hardware-level security policy enforcement", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "97--112", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541947", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Privacy and integrity are important security concerns. These concerns are addressed by controlling information flow, i.e., restricting how information can flow through a system. Most proposed systems that restrict information flow make the implicit assumption that the hardware used by the system is fully ``correct'' and that the hardware's instruction set accurately describes its behavior in all circumstances. 
The truth is more complicated: modern hardware designs defy complete verification; many aspects of the timing and ordering of events are left totally unspecified; and implementation bugs present themselves with surprising frequency. In this work we describe Sapper, a novel hardware description language for designing security-critical hardware components. Sapper seeks to address these problems by using static analysis at compile-time to automatically insert dynamic checks in the resulting hardware that provably enforce a given information flow policy at execution time. We present Sapper's design and formal semantics along with a proof sketch of its security. In addition, we have implemented a compiler for Sapper and used it to create a non-trivial secure embedded processor with many modern microarchitectural features. We empirically evaluate the resulting hardware's area and energy overhead and compare them with alternative designs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Banabic:2014:FTM, author = "Radu Banabic and George Candea and Rachid Guerraoui", title = "Finding {Trojan} message vulnerabilities in distributed systems", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "113--126", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541984", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Trojan messages are messages that seem correct to the receiver but cannot be generated by any correct sender. Such messages constitute major vulnerability points of a distributed system---they constitute ideal targets for a malicious actor and facilitate failure propagation across nodes. 
We describe Achilles, a tool that searches for Trojan messages in a distributed system. Achilles uses dynamic white-box analysis on the distributed system binaries in order to infer the predicate that defines messages parsed by receiver nodes and generated by sender nodes, respectively, and then computes Trojan messages as the difference between the two. We apply Achilles on implementations of real distributed systems: FSP, a file transfer application, and PBFT, a Byzantine-fault-tolerant state machine replication library. Achilles discovered a new bug in FSP and rediscovered a previously known vulnerability in PBFT. In our evaluation we demonstrate that our approach can perform orders of magnitude better than general approaches based on regular fuzzing and symbolic execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Delimitrou:2014:QRE, author = "Christina Delimitrou and Christos Kozyrakis", title = "{Quasar}: resource-efficient and {QoS}-aware cluster management", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "127--144", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541941", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cloud computing promises flexibility and high performance for users and high cost-efficiency for operators. Nevertheless, most cloud facilities operate at very low utilization, hurting both cost effectiveness and future scalability. We present Quasar, a cluster management system that increases resource utilization while providing consistently high application performance. Quasar employs three techniques. 
First, it does not rely on resource reservations, which lead to underutilization as users do not necessarily understand workload dynamics and physical resource requirements of complex codebases. Instead, users express performance constraints for each workload, letting Quasar determine the right amount of resources to meet these constraints at any point. Second, Quasar uses classification techniques to quickly and accurately determine the impact of the amount of resources (scale-out and scale-up), type of resources, and interference on performance for each workload and dataset. Third, it uses the classification results to jointly perform resource allocation and assignment, quickly exploring the large space of options for an efficient way to pack workloads on available resources. Quasar monitors workload performance and adjusts resource allocation and assignment when needed. We evaluate Quasar over a wide range of workload scenarios, including combinations of distributed analytics frameworks and low-latency, stateful services, both on a local cluster and a cluster of dedicated EC2 servers. At steady state, Quasar improves resource utilization by 47\% in the 200-server EC2 cluster, while meeting performance constraints for workloads of all types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Zahedi:2014:RRE, author = "Seyed Majid Zahedi and Benjamin C. 
Lee", title = "{REF}: resource elasticity fairness with sharing incentives for multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "145--160", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541962", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the democratization of cloud and datacenter computing, users increasingly share large hardware platforms. In this setting, architects encounter two challenges: sharing fairly and sharing multiple resources. Drawing on economic game-theory, we rethink fairness in computer architecture. A fair allocation must provide sharing incentives (SI), envy-freeness (EF), and Pareto efficiency (PE). We show that Cobb--Douglas utility functions are well suited to modeling user preferences for cache capacity and memory bandwidth. And we present an allocation mechanism that uses Cobb--Douglas preferences to determine each user's fair share of the hardware. This mechanism provably guarantees SI, EF, and PE, as well as strategy-proofness in the large (SPL). 
And it does so with modest performance penalties, less than 10\% throughput loss, relative to an unfair mechanism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Muthukaruppan:2014:PTB, author = "Thannirmalai Somu Muthukaruppan and Anuj Pathania and Tulika Mitra", title = "Price theory based power management for heterogeneous multi-cores", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "161--176", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541974", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Heterogeneous multi-cores that integrate cores with different power performance characteristics are promising alternatives to homogeneous systems in energy- and thermally constrained environments. However, the heterogeneity imposes significant challenges to power-aware scheduling. We present a price theory-based dynamic power management framework for heterogeneous multi-cores that co-ordinates various energy savings opportunities, such as dynamic voltage/frequency scaling, load balancing, and task migration in tandem, to achieve the best power-performance characteristics. Unlike existing centralized power management frameworks, ours is distributed and hence scalable with minimal runtime overhead. We design and implement the framework within Linux operating system on ARM big.LITTLE heterogeneous multi-core platform.
Experimental evaluation confirms the advantages of our approach compared to the state-of-the-art techniques for power management in heterogeneous multi-cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Wang:2014:UBP, author = "Di Wang and Sriram Govindan and Anand Sivasubramaniam and Aman Kansal and Jie Liu and Badriddine Khessib", title = "Underprovisioning backup power infrastructure for datacenters", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "177--192", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541966", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "While there has been prior work to underprovision the power distribution infrastructure for a datacenter to save costs, the ability to underprovision the backup power infrastructure, which contributes significantly to capital costs, is little explored. There are two main components in the backup infrastructure --- Diesel Generators (DGs) and UPS units --- which can both be underprovisioned (or even removed) in terms of their power and/or energy capacities. However, embarking on such underprovisioning mandates studying several ramifications --- the resulting cost savings, the lower availability, and the performance and state loss consequences on individual applications --- concurrently. This paper presents the first such study, considering cost, availability, performance and application consequences of underprovisioning the backup power infrastructure. 
We present a framework to quantify the cost of backup capacity that is provisioned, and implement techniques leveraging existing software and hardware mechanisms to provide as seamless an operation as possible for an application within the provisioned backup capacity during a power outage. We evaluate the cost-performance-availability trade-offs for different levels of backup underprovisioning for applications with diverse reliance on the backup infrastructure. Our results show that one may be able to completely do away with DGs, compensating for it with additional UPS energy capacities, to significantly cut costs and still be able to handle power outages lasting as high as 40 minutes (which constitute bulk of the outages). Further, we can push the limits of outage duration that can be handled in a cost-effective manner, if applications are willing to tolerate degraded performance during the outage. Our evaluations also show that different applications react differently to the outage handling mechanisms, and that the efficacy of the mechanisms is sensitive to the outage duration. 
The insights from this paper can spur new opportunities for future work on backup power infrastructure optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Yu:2014:CPR, author = "Xiao Yu and Shi Han and Dongmei Zhang and Tao Xie", title = "Comprehending performance from real-world execution traces: a device-driver case", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "193--206", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541968", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Real-world execution traces record performance problems that are likely perceived at deployment sites. However, those problems can be rooted subtly and deeply into system layers or other components far from the place where delays are initially observed. To tackle challenges of identifying deeply rooted problems, we propose a new trace-based approach consisting of two steps: impact analysis and causality analysis. The impact analysis measures performance impacts on a component basis, and the causality analysis discovers patterns of runtime behaviors that are likely to cause the measured impacts. The discovered patterns can help performance analysts quickly identify root causes of perceived performance problems. We instantiate our approach to study the performance of device drivers on over 19,500 real-world execution traces. The impact analysis shows that device drivers constitute a non-trivial part ($ \approx 38 \% $) in the overall system performance, and a big part ($ \approx 26 \% $) is due to interactions between drivers.
The causality analysis effectively discovers highly suspicious and high-impact behavioral patterns in device drivers, examined and confirmed by our automated evaluation, developers, and performance analysts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Arulraj:2014:LST, author = "Joy Arulraj and Guoliang Jin and Shan Lu", title = "Leveraging the short-term memory of hardware to diagnose production-run software failures", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "207--222", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541973", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Failures caused by software bugs are widespread in production runs, causing severe losses for end users. Unfortunately, diagnosing production-run failures is challenging. Existing work cannot satisfy privacy, run-time overhead, diagnosis capability, and diagnosis latency requirements all at once. This paper designs a low overhead, low latency, privacy preserving production-run failure diagnosis system based on two observations. First, short-term memory of program execution is often sufficient for failure diagnosis, as many bugs have short propagation distances. Second, maintaining a short-term memory of execution is much cheaper than maintaining a record of the whole execution. Following these observations, we first identify an existing hardware unit, Last Branch Record (LBR), that records the last few taken branches to help diagnose sequential bugs. We then propose a simple hardware extension, Last Cache-coherence Record (LCR), to record the last few cache accesses with specified coherence states and hence help diagnose concurrency bugs. 
Finally, we design LBRA and LCRA to automatically locate failure root causes using LBR and LCR. Our evaluation uses 31 real-world sequential and concurrency bug failures from 18 representative open-source software. The results show that with just 16 record entries, LBR and LCR enable our system to automatically locate the root causes for 27 out of 31 failures, with less than 3\% run-time overhead. As our system does not rely on sampling, \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Honarmand:2014:RRR, author = "Nima Honarmand and Josep Torrellas", title = "{RelaxReplay}: record and replay for relaxed-consistency multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "223--238", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541979", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Record and Deterministic Replay (RnR) of multithreaded programs on relaxed-consistency multiprocessors has been a long-standing problem. While there are designs that work for Total Store Ordering (TSO), finding a general solution that is able to record the access reordering allowed by any relaxed-consistency model has proved challenging. This paper presents the first complete solution for hardware-assisted memory race recording that works for any relaxed-consistency model of current processors. With the scheme, called RelaxReplay, we can build an RnR system for any relaxed-consistency model and coherence protocol. RelaxReplay's core innovation is a new way of capturing memory access reordering. Each memory instruction goes through a post-completion in-order counting step that detects any reordering, and efficiently records it.
We evaluate RelaxReplay with simulations of an 8-core release-consistent multicore running SPLASH-2 programs. We observe that RelaxReplay induces negligible overhead during recording. In addition, the average size of the log produced is comparable to the log sizes reported for existing solutions, and still very small compared to the memory bandwidth of modern machines. Finally, deterministic replay is efficient and needs minimal hardware support.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Bucur:2014:PSE, author = "Stefan Bucur and Johannes Kinder and George Candea", title = "Prototyping symbolic execution engines for interpreted languages", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "239--254", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541977", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Symbolic execution is being successfully used to automatically test statically compiled code. However, increasingly more systems and applications are written in dynamic interpreted languages like Python. Building a new symbolic execution engine is a monumental effort, and so is keeping it up-to-date as the target language evolves. Furthermore, ambiguous language specifications lead to their implementation in a symbolic execution engine potentially differing from the production interpreter in subtle ways. We address these challenges by flipping the problem and using the interpreter itself as a specification of the language semantics. We present a recipe and tool (called Chef) for turning a vanilla interpreter into a sound and complete symbolic execution engine. 
Chef symbolically executes the target program by symbolically executing the interpreter's binary while exploiting inferred knowledge about the program's high-level structure. Using Chef, we developed a symbolic execution engine for Python in 5 person-days and one for Lua in 3 person-days. They offer complete and faithful coverage of language features in a way that keeps up with future language versions at near-zero cost. Chef-produced engines are up to 1000 times more performant than if directly executing the interpreter symbolically without Chef.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Wu:2014:QAD, author = "Lisa Wu and Andrea Lottarini and Timothy K. Paine and Martha A. Kim and Kenneth A. Ross", title = "{Q100}: the architecture and design of a database processing unit", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "255--268", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541961", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we propose Database Processing Units, or DPUs, a class of domain-specific database processors that can efficiently handle database applications. As a proof of concept, we present the instruction set architecture, microarchitecture, and hardware implementation of one DPU, called Q100. The Q100 has a collection of heterogeneous ASIC tiles that process relational tables and columns quickly and energy-efficiently. The architecture uses coarse grained instructions that manipulate streams of data, thereby maximizing pipeline and data parallelism, and minimizing the need to time multiplex the accelerator tiles and spill intermediate results to memory.
This work explores a Q100 design space of 150 configurations, selecting three for further analysis: a small, power-conscious implementation, a high-performance implementation, and a balanced design that maximizes performance per Watt. We then demonstrate that the power-conscious Q100 handles the TPC-H queries with three orders of magnitude less energy than a state of the art software DBMS, while the performance-oriented design outperforms the same DBMS by 70X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Chen:2014:DSF, author = "Tianshi Chen and Zidong Du and Ninghui Sun and Jia Wang and Chengyong Wu and Yunji Chen and Olivier Temam", title = "{DianNao}: a small-footprint high-throughput accelerator for ubiquitous machine-learning", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "269--284", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541967", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Machine-Learning tasks are becoming pervasive in a broad range of domains, and in a broad range of systems (from embedded systems to data centers). At the same time, a small set of machine-learning algorithms (especially Convolutional and Deep Neural Networks, i.e., CNNs and DNNs) are proving to be state-of-the-art across many applications. As architectures evolve towards heterogeneous multi-cores composed of a mix of cores and accelerators, a machine-learning accelerator can achieve the rare combination of efficiency (due to the small number of target algorithms) and broad application scope. Until now, most machine-learning accelerator designs have focused on efficiently implementing the computational part of the algorithms.
However, recent state-of-the-art CNNs and DNNs are characterized by their large size. In this study, we design an accelerator for large-scale CNNs and DNNs, with a special emphasis on the impact of memory on accelerator design, performance and energy. We show that it is possible to design an accelerator with a high throughput, capable of performing 452 GOP/s (key NN operations such as synaptic weight multiplications and neurons outputs additions) in a small footprint of 3.02 mm$^2$ and 485 mW; compared to a 128-bit 2 GHz SIMD processor, the accelerator is 117.87x faster, and it can reduce the total energy by 21.08x. The accelerator characteristics are obtained after layout at 65 nm. Such a high throughput in a small footprint can open up the usage of state-of-the-art machine-learning algorithms in a broad set of systems and for a broad set of applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Lin:2014:KMO, author = "Felix Xiaozhu Lin and Zhen Wang and Lin Zhong", title = "{K2}: a mobile operating system for heterogeneous coherence domains", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "285--300", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541975", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Mobile System-on-Chips (SoC) that incorporate heterogeneous coherence domains promise high energy efficiency to a wide range of mobile applications, yet are difficult to program. To exploit the architecture, a desirable, yet missing capability is to replicate operating system (OS) services over multiple coherence domains with minimum inter-domain communication.
In designing such an OS, we set three goals: to ease application development, to simplify OS engineering, and to preserve the current OS performance. To this end, we identify a shared-most OS model for multiple coherence domains: creating per-domain instances of core OS services with no shared state, while enabling other extended OS services to share state across domains. To test the model, we build K2, a prototype OS on the TI OMAP4 SoC, by reusing most of the Linux 3.4 source. K2 presents a single system image to applications with its two kernels running on top of the two coherence domains of OMAP4. The two kernels have independent instances of core OS services, such as page allocator and interrupt management, as coordinated by K2; the two kernels share most extended OS services, such as device drivers, whose state is kept coherent transparently by K2. Despite platform constraints and unoptimized code, K2 improves energy efficiency for light OS workloads by 8x-10x, while incurring less than 6\% performance overhead for a device driver shared between kernels. Our experiences with K2 show that the shared-most model is promising.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Menychtas:2014:DSF, author = "Konstantinos Menychtas and Kai Shen and Michael L. 
Scott", title = "Disengaged scheduling for fair, protected access to fast computational accelerators", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "301--316", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541963", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Today's operating systems treat GPUs and other computational accelerators as if they were simple devices, with bounded and predictable response times. With accelerators assuming an increasing share of the workload on modern machines, this strategy is already problematic, and likely to become untenable soon. If the operating system is to enforce fair sharing of the machine, it must assume responsibility for accelerator scheduling and resource management. Fair, safe scheduling is a particular challenge on fast accelerators, which allow applications to avoid kernel-crossing overhead by interacting directly with the device. We propose a disengaged scheduling strategy in which the kernel intercedes between applications and the accelerator on an infrequent basis, to monitor their use of accelerator cycles and to determine which applications should be granted access over the next time interval. Our strategy assumes a well defined, narrow interface exported by the accelerator. We build upon such an interface, systematically inferred for the latest Nvidia GPUs. We construct several example schedulers, including Disengaged Timeslice with overuse control that guarantees fairness and Disengaged Fair Queueing that is effective in limiting resource idleness, but probabilistic. 
Both schedulers ensure fair sharing of the GPU, even among uncooperative or adversarial applications; Disengaged Fair Queueing incurs a 4\% overhead on average (max 18\%) compared to direct device access across our evaluation scenarios.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Gehlhaar:2014:NPN, author = "Jeff Gehlhaar", title = "Neuromorphic processing: a new frontier in scaling computer architecture", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "317--318", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2564710", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The desire to build a computer that operates in the same manner as our brains is as old as the computer itself. Although computer engineering has made great strides in hardware performance as a result of Dennard scaling, and even great advances in 'brain like' computation, the field still struggles to move beyond sequential, analytical computing architectures. Neuromorphic systems are being developed to transcend the barriers imposed by silicon power consumption, develop new algorithms that help machines achieve cognitive behaviors, and both exploit and enable further research in neuroscience. In this talk I will discuss a system implementing spiking neural networks. These systems hold the promise of an architecture that is event based, broad and shallow, and thus more power efficient than conventional computing solutions. This new approach to computation based on modeling the brain and its simple but highly connected units presents a host of new challenges. Hardware faces tradeoffs such as density or lower power at the cost of high interconnection overhead. 
Consequently, software systems must face choices about new language design. Highly distributed hardware systems require complex place and route algorithms to distribute the execution of the neural network across a large number of highly interconnected processing units. Finally, the overall design, simulation and testing process has to be entirely reimagined. We discuss these issues in the context of the Zeroth processor and how this approach compares to other neuromorphic systems that are becoming available.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Sani:2014:PDF, author = "Ardalan Amiri Sani and Kevin Boos and Shaopu Qin and Lin Zhong", title = "{I/O} paravirtualization at the device file boundary", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "319--332", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541943", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Paravirtualization is an important I/O virtualization technology since it uniquely provides all of the following benefits: the ability to share the device between multiple VMs, support for legacy devices without virtualization hardware, and high performance. However, existing paravirtualization solutions have one main limitation: they only support one I/O device class, and would require significant engineering effort to support new device classes and features. In this paper, we present Paradice, a solution that vastly simplifies I/O paravirtualization by using a common paravirtualization boundary for various I/O device classes: Unix device files. 
Using this boundary, the paravirtual drivers simply act as a class-agnostic indirection layer between the application and the actual device driver. We address two fundamental challenges: supporting cross-VM driver memory operations without changes to applications or device drivers and providing fault and device data isolation between guest VMs despite device driver bugs. We implement Paradice for x86, the Xen hypervisor, and the Linux and FreeBSD OSes. Our implementation paravirtualizes various GPUs, input devices, cameras, an audio device, and an Ethernet card for the netmap framework with $ \sim $7700 LoC, of which only $ \sim $900 are device class-specific. Our measurements show that Paradice achieves performance close to native for different devices and applications including netmap, 3D HD games, and OpenCL applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Dall:2014:KAD, author = "Christoffer Dall and Jason Nieh", title = "{KVM\slash ARM}: the design and implementation of the {Linux ARM} hypervisor", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "333--348", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541946", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As ARM CPUs become increasingly common in mobile devices and servers, there is a growing demand for providing the benefits of virtualization for ARM-based devices. We present our experiences building the Linux ARM hypervisor, KVM/ARM, the first full system ARM virtualization solution that can run unmodified guest operating systems on ARM multicore hardware.
KVM/ARM introduces split-mode virtualization, allowing a hypervisor to split its execution across CPU modes and be integrated into the Linux kernel. This allows KVM/ARM to leverage existing Linux hardware support and functionality to simplify hypervisor development and maintainability while utilizing recent ARM hardware virtualization extensions to run virtual machines with comparable performance to native execution. KVM/ARM has been successfully merged into the mainline Linux kernel, ensuring that it will gain wide adoption as the virtualization platform of choice for ARM. We provide the first measurements on real hardware of a complete hypervisor using ARM hardware virtualization support. Our results demonstrate that KVM/ARM has modest virtualization performance and power costs, and can achieve lower performance and power costs compared to x86-based Linux virtualization on multicore hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Amit:2014:VMS, author = "Nadav Amit and Dan Tsafrir and Assaf Schuster", title = "{VSwapper}: a memory swapper for virtualized environments", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "349--366", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541969", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The number of guest virtual machines that can be consolidated on one physical host is typically limited by the memory size, motivating memory overcommitment. Guests are given a choice to either install a ``balloon'' driver to coordinate the overcommitment activity, or to experience degraded performance due to uncooperative swapping. 
Ballooning, however, is not a complete solution, as hosts must still fall back on uncooperative swapping in various circumstances. Additionally, ballooning takes time to accommodate change, and so guests might experience degraded performance under changing conditions. Our goal is to improve the performance of hosts when they fall back on uncooperative swapping and/or operate under changing load conditions. We carefully isolate and characterize the causes for the associated poor performance, which include various types of superfluous swap operations, decayed swap file sequentiality, and ineffective prefetch decisions upon page faults. We address these problems by implementing VSwapper, a guest-agnostic memory swapper for virtual environments that allows efficient, uncooperative overcommitment. With inactive ballooning, VSwapper yields up to an order of magnitude performance improvement. Combined with ballooning, VSwapper can achieve up to double the performance under changing load conditions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Andrus:2014:CNE, author = "Jeremy Andrus and Alexander Van't Hof and Naser AlDuaij and Christoffer Dall and Nicolas Viennot and Jason Nieh", title = "{Cider}: native execution of {iOS} apps on {Android}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "367--382", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541972", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present Cider, an operating system compatibility architecture that can run applications built for different mobile ecosystems, iOS or Android, together on the same smartphone or tablet. 
Cider enhances the domestic operating system, Android, of a device with kernel-managed, per-thread personas to mimic the application binary interface of a foreign operating system, iOS, enabling it to run unmodified foreign binaries. This is accomplished using a novel combination of binary compatibility techniques including two new mechanisms: compile-time code adaptation, and diplomatic functions. Compile-time code adaptation enables existing unmodified foreign source code to be reused in the domestic kernel, reducing implementation effort required to support multiple binary interfaces for executing domestic and foreign applications. Diplomatic functions leverage per-thread personas, and allow foreign applications to use domestic libraries to access proprietary software and hardware interfaces. We have built a Cider prototype, and demonstrate that it imposes modest performance overhead and runs unmodified iOS and Android applications together on a Google Nexus tablet running the latest version of Android.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Litz:2014:STR, author = "Heiner Litz and David Cheriton and Amin Firoozshahian and Omid Azizi and John P. Stevenson", title = "{SI-TM}: reducing transactional memory abort rates through snapshot isolation", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "383--398", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541952", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Transactional memory represents an attractive conceptual model for programming concurrent applications. Unfortunately, high transaction abort rates can cause significant performance degradation. 
Conventional transactional memory realizations not only pessimistically abort transactions on every read-write conflict but also because of false sharing, cache evictions, TLB misses, page faults and interrupts. Consequently, the use of transactions needs to be restricted to a very small number of operations to achieve predictable performance, thereby, limiting its benefit to programming simplification. In this paper, we investigate snapshot isolation transactional memory in which transactions operate on memory snapshots that always guarantee consistent reads. By exploiting snapshots, an established database model of transactions, transactions can ignore read-write conflicts and only need to abort on write-write conflicts. Our implementation utilizes a memory controller that supports multiversion memory, to efficiently support snapshotting in hardware. We show that snapshot isolation can reduce the number of aborts in some cases by three orders of magnitude and improve performance by up to 20x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Ruan:2014:TLC, author = "Wenjia Ruan and Trilok Vyas and Yujie Liu and Michael Spear", title = "Transactionalizing legacy code: an experience report using {GCC} and {Memcached}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "399--412", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541960", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The addition of transactional memory (TM) support to existing languages provides the opportunity to create new software from scratch using transactions, and also to simplify or extend
legacy code by replacing existing synchronization with language-level transactions. In this paper, we describe our experiences transactionalizing the memcached application through the use of the GCC implementation of the Draft C++ TM Specification. We present experiences and recommendations that we hope will guide the effort to integrate TM into languages, and that may also contribute to the growing collective knowledge about how programmers can begin to exploit TM in existing production-quality software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Morrison:2014:FFW, author = "Adam Morrison and Yehuda Afek", title = "Fence-free work stealing on bounded {TSO} processors", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "413--426", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541987", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Work stealing is the method of choice for load balancing in task parallel programming languages and frameworks. Yet despite considerable effort invested in optimizing work stealing task queues, existing algorithms issue a costly memory fence when removing a task, and these fences are believed to be necessary for correctness. This paper refutes this belief, demonstrating work stealing algorithms in which a worker does not issue a memory fence for microarchitectures with a bounded total store ordering (TSO) memory model. Bounded TSO is a novel restriction of TSO --- capturing mainstream x86 and SPARC TSO processors --- that bounds the number of stores a load can be reordered with. 
Our algorithms eliminate the memory fence penalty, improving the running time of a suite of parallel benchmarks on modern x86 multicore processors by 7\%--11\% on average (and up to 23\%), compared to the Cilk and Chase--Lev work stealing queues.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Hower:2014:HRF, author = "Derek R. Hower and Blake A. Hechtman and Bradford M. Beckmann and Benedict R. Gaster and Mark D. Hill and Steven K. Reinhardt and David A. Wood", title = "Heterogeneous-race-free memory models", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "427--440", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541981", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Commodity heterogeneous systems (e.g., integrated CPUs and GPUs), now support a unified, shared memory address space for all components. Because the latency of global communication in a heterogeneous system can be prohibitively high, heterogeneous systems (unlike homogeneous CPU systems) provide synchronization mechanisms that only guarantee ordering among a subset of threads, which we call a scope. Unfortunately, the consequences and semantics of these scoped operations are not yet well understood. Without a formal and approachable model to reason about the behavior of these operations, we risk an array of portability and performance issues. In this paper, we embrace scoped synchronization with a new class of memory consistency models that add scoped synchronization to data-race-free models like those of C++ and Java.
Called sequential consistency for heterogeneous-race-free (SC for HRF), the new models guarantee SC for programs with ``sufficient'' synchronization (no data races) of ``sufficient'' scope. We discuss two such models. The first, HRF-direct, works well for programs with highly regular parallelism. The second, HRF-indirect, builds on HRF-direct by allowing synchronization using different scopes in some cases involving transitive communication. We quantitatively show that HRF-indirect encourages forward-looking programs with irregular parallelism by showing up to a 10\% performance increase in a task runtime for GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Jung:2014:TNS, author = "Myoungsoo Jung and Wonil Choi and John Shalf and Mahmut Taylan Kandemir", title = "{Triple-A}: a Non-{SSD} based autonomic all-flash array for high performance storage systems", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "441--454", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541953", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Solid State Disk (SSD) arrays are in a position to (at least partially) replace spinning disk arrays in high performance computing (HPC) systems due to their better performance and lower power consumption. However, these emerging SSD arrays are facing enormous challenges, which are not observed in disk-based arrays. Specifically, we observe that the performance of SSD arrays can significantly degrade due to various array-level resource contentions. In addition, their maintenance costs exponentially increase over time, which renders them difficult to deploy widely in HPC systems.
To address these challenges, we propose Triple-A, a non-SSD based Autonomic All-Flash Array, which is a self-optimizing, from-scratch NAND flash cluster. Triple-A can detect two different types of resource contentions and autonomically alleviate them by reshaping the physical data-layout on its flash array network. Our experimental evaluation using both real workloads and a micro-benchmark shows that Triple-A can offer a 53\% higher sustained throughput and an 80\% lower I/O latency than non-autonomic SSD arrays.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Liu:2014:NDU, author = "Ren-Shuo Liu and De-Yu Shen and Chia-Lin Yang and Shun-Chih Yu and Cheng-Yuan Michael Wang", title = "{NVM} duet: unified working memory and persistent store architecture", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "455--470", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541957", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging non-volatile memory (NVM) technologies have gained a lot of attention recently. The byte-addressability and high density of NVM enable computer architects to build large-scale main memory systems. NVM has also been shown to be a promising alternative to conventional persistent store. With NVM, programmers can persistently retain in-memory data structures without writing them to disk. Therefore, one can envision that in the future, NVM will play the role of both working memory and persistent store at the same time. Persistent store demands consistency and durability guarantees, thereby imposing new design constraints on the memory system.
Consistency is achieved at the expense of serializing multiple write operations. Durability requires memory cells to guarantee non-volatility and thus reduces the write speed. Therefore, a unified architecture oblivious to these two use cases would lead to suboptimal design. In this paper, we propose a novel unified working memory and persistent store architecture, NVM Duet, which provides the required consistency and durability guarantees for persistent store while relaxing these constraints if accesses to NVM are for working memory. A cross-layer design approach is adopted to achieve the design goal. Overall, simulation results demonstrate that NVM Duet achieves up to 1.68x (1.32x on average) speedup compared with the baseline design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Ouyang:2014:SSD, author = "Jian Ouyang and Shiding Lin and Song Jiang and Zhenyu Hou and Yong Wang and Yuanzheng Wang", title = "{SDF}: software-defined flash for {Web}-scale {Internet} storage systems", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "471--484", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541959", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the last several years hundreds of thousands of SSDs have been deployed in the data centers of Baidu, China's largest Internet search company. Currently only 40\% or less of the raw bandwidth of the flash memory in the SSDs is delivered by the storage system to the applications.
Moreover, because of space over-provisioning in the SSD to accommodate non-sequential or random writes, and additionally, parity coding across flash channels, typically only 50-70\% of the raw capacity of a commodity SSD can be used for user data. Given the large scale of Baidu's data center, making the most effective use of its SSDs is of great importance. Specifically, we seek to maximize both bandwidth and usable capacity. To achieve this goal we propose {\em software-defined flash} (SDF), a hardware/software co-designed storage system to maximally exploit the performance characteristics of flash memory in the context of our workloads. SDF exposes individual flash channels to the host software and eliminates space over-provisioning. The host software, given direct access to the raw flash channels of the SSD, can effectively organize its data and schedule its data access to better realize the SSD's raw performance potential. Currently more than 3000 SDFs have been deployed in Baidu's storage system that supports its web page and image repository services. Our measurements show that SDF can deliver approximately 95\% of the raw flash bandwidth and provide 99\% of the flash capacity for user data. SDF increases I/O bandwidth by 300\% and reduces per-GB hardware cost by 50\% on average compared with the commodity-SSD-based system used at Baidu.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Gutierrez:2014:ISS, author = "Anthony Gutierrez and Michael Cieslak and Bharan Giridhar and Ronald G.
Dreslinski and Luis Ceze and Trevor Mudge", title = "Integrated {$3$D}-stacked server designs for increasing physical density of key-value stores", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "485--498", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541951", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Key-value stores, such as Memcached, have been used to scale web services since the beginning of the Web 2.0 era. Data center real estate is expensive, and several industry experts we have spoken to have suggested that a significant portion of their data center space is devoted to key value stores. Despite its wide-spread use, there is little in the way of hardware specialization for increasing the efficiency and density of Memcached; it is currently deployed on commodity servers that contain high-end CPUs designed to extract as much instruction-level parallelism as possible. Out-of-order CPUs, however, have been shown to be inefficient when running Memcached. To address Memcached efficiency issues, we propose two architectures using 3D stacking to increase data storage efficiency. Our first 3D architecture, Mercury, consists of stacks of ARM Cortex-A7 cores with 4GB of DRAM, as well as NICs. Our second architecture, Iridium, replaces DRAM with NAND Flash to improve density. We explore, through simulation, the potential efficiency benefits of running Memcached on servers that use 3D-stacking to closely integrate low-power CPUs with NICs and memory. With Mercury we demonstrate that density may be improved by 2.9X, power efficiency by 4.9X, throughput by 10X, and throughput per GB by 3.5X over a state-of-the-art server running optimized Memcached.
With Iridium we show that density may be increased by 14X, power efficiency by 2.4X, and throughput by 5.2X, while still meeting latency requirements for a majority of requests.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Nguyen:2014:DGD, author = "Donald Nguyen and Andrew Lenharth and Keshav Pingali", title = "Deterministic {Galois}: on-demand, portable and parameterless", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "499--512", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541964", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Non-determinism in program execution can make program development and debugging difficult. In this paper, we argue that solutions to this problem should be on-demand, portable and parameterless. On-demand means that the programming model should permit the writing of non-deterministic programs since these programs often perform better than deterministic ones for the same problem. Portable means that the program should produce the same answer even if it is run on different machines. Parameterless means that if there are machine-dependent scheduling parameters that must be tuned for good performance, they must not affect the output. Although many solutions for deterministic program execution have been proposed in the literature, they fall short along one or more of these dimensions. To remedy this, we propose a new approach, based on the Galois programming model, in which (i) the programming model permits the writing of non-deterministic programs and (ii) the runtime system executes these programs deterministically if needed. 
Evaluation of this approach on a collection of benchmarks from the PARSEC, PBBS, and Lonestar suites shows that it delivers deterministic execution with substantially less overhead than other systems in the literature.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Ribic:2014:EEW, author = "Haris Ribic and Yu David Liu", title = "Energy-efficient work-stealing language runtimes", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "513--528", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541971", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Work stealing is a promising approach to constructing multithreaded program runtimes of parallel programming languages. This paper presents HERMES, an energy-efficient work-stealing language runtime. The key insight is that threads in a work-stealing environment --- thieves and victims --- have varying impacts on the overall program running time, and a coordination of their execution ``tempo'' can lead to energy efficiency with minimal performance loss. The centerpiece of HERMES is two complementary algorithms to coordinate thread tempo: the workpath-sensitive algorithm determines tempo for each thread based on thief-victim relationships on the execution path, whereas the workload-sensitive algorithm selects appropriate tempo based on the size of work-stealing deques. We construct HERMES on top of Intel Cilk Plus's runtime, and implement tempo adjustment through standard Dynamic Voltage and Frequency Scaling (DVFS). 
Benchmarks running on HERMES demonstrate an average of 11-12\% energy savings with an average of 3-4\% performance loss through meter-based measurements over commercial CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Mytkowicz:2014:DPF, author = "Todd Mytkowicz and Madanlal Musuvathi and Wolfram Schulte", title = "Data-parallel finite-state machines", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "529--542", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541988", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A finite-state machine (FSM) is an important abstraction for solving several problems, including regular-expression matching, tokenizing text, and Huffman decoding. FSM computations typically involve data-dependent iterations with unpredictable memory-access patterns making them difficult to parallelize. This paper describes a parallel algorithm for FSMs that breaks dependences across iterations by efficiently enumerating transitions from all possible states on each input symbol. This allows the algorithm to utilize various sources of data parallelism available on modern hardware, including vector instructions and multiple processors/cores. 
For instance, on benchmarks from three FSM applications: regular expressions, Huffman decoding, and HTML tokenization, the parallel algorithm achieves up to a 3x speedup over optimized sequential baselines on a single core, and linear speedups up to 21x on 8 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Zhao:2014:CES, author = "Zhijia Zhao and Bo Wu and Xipeng Shen", title = "Challenging the {``embarrassingly sequential''}: parallelizing finite state machine-based computations through principled speculation", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "543--558", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541989", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Finite-State Machine (FSM) applications are important for many domains. But FSM computation is inherently sequential, making such applications notoriously difficult to parallelize. Most prior methods address the problem through speculations on simple heuristics, offering limited applicability and inconsistent speedups. This paper provides some principled understanding of FSM parallelization, and offers the first disciplined way to exploit application-specific information to inform speculations for parallelization. Through a series of rigorous analysis, it presents a probabilistic model that captures the relations between speculative executions and the properties of the target FSM and its inputs. With the formulation, it proposes two model-based speculation schemes that automatically customize themselves with the suitable configurations to maximize the parallelization benefits. 
This rigorous treatment yields near-linear speedup on applications that state-of-the-art techniques can barely accelerate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Zhou:2014:SAS, author = "Yanqi Zhou and David Wentzlaff", title = "The sharing architecture: sub-core configurability for {IaaS} clouds", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "559--574", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541950", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Businesses and Academics are increasingly turning to Infrastructure as a Service (IaaS) Clouds such as Amazon's Elastic Compute Cloud (EC2) to fulfill their computing needs. Unfortunately, current IaaS systems provide a severely restricted palette of rentable computing options which do not optimally fit the workloads that they are executing. We address this challenge by proposing and evaluating a manycore architecture, called the Sharing Architecture, specifically optimized for IaaS systems by being reconfigurable on a sub-core basis. The Sharing Architecture enables better matching of workload to micro-architecture resources by replacing static cores with Virtual Cores which can be dynamically reconfigured to have different numbers of ALUs and amount of Cache. This reconfigurability enables many of the same benefits of heterogeneous multicores, but in a homogeneous fabric, and enables the reuse and resale of resources on a per ALU or per KB of cache basis. The Sharing Architecture leverages Distributed ILP techniques, but is designed in a way to be independent of recompilation.
In addition, we introduce an economic model which is enabled by the Sharing Architecture and show how different users who have varying needs can be better served by such a flexible architecture. We evaluate the Sharing Architecture across a benchmark suite of Apache, SPECint, and parts of PARSEC, and find that it can achieve up to a 5x more economically efficient market when compared to static architecture multicores. We implemented the Sharing Architecture in Verilog and present area overhead results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Waterland:2014:AAS, author = "Amos Waterland and Elaine Angelino and Ryan P. Adams and Jonathan Appavoo and Margo Seltzer", title = "{ASC}: automatically scalable computation", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "575--590", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541985", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present an architecture designed to transparently and automatically scale the performance of sequential programs as a function of the hardware resources available. The architecture is predicated on a model of computation that views program execution as a walk through the enormous state space composed of the memory and registers of a single-threaded processor. Each instruction execution in this model moves the system from its current point in state space to a deterministic subsequent point. We can parallelize such execution by predictively partitioning the complete path and speculatively executing each partition in parallel. Accurately partitioning the path is a challenging prediction problem. 
We have implemented our system using a functional simulator that emulates the x86 instruction set, including a collection of state predictors and a mechanism for speculatively executing threads that explore potential states along the execution path. While the overhead of our simulation makes it impractical to measure speedup relative to native x86 execution, experiments on three benchmarks show scalability of up to a factor of 256 on a 1024 core machine when executing unmodified sequential programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Eyerman:2014:BSM, author = "Stijn Eyerman and Lieven Eeckhout", title = "The benefit of {SMT} in the multi-core era: flexibility towards degrees of thread-level parallelism", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "591--606", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541954", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The number of active threads in a multi-core processor varies over time and is often much smaller than the number of supported hardware threads. This requires multi-core chip designs to balance core count and per-core performance. Low active thread counts benefit from a few big, high-performance cores, while high active thread counts benefit more from a sea of small, energy-efficient cores. This paper comprehensively studies the trade-offs in multi-core design given dynamically varying active thread counts. 
We find that, under these workload conditions, a homogeneous multi-core processor, consisting of a few high-performance SMT cores, typically outperforms heterogeneous multi-cores consisting of a mix of big and small cores (without SMT), within the same power budget. We also show that a homogeneous multi-core performs almost as well as a heterogeneous multi-core that also implements SMT, as well as a dynamic multi-core, while being less complex to design and verify. Further, heterogeneous multi-cores that power-gate idle cores yield (only) slightly better energy-efficiency compared to homogeneous multi-cores. The overall conclusion is that the benefit of SMT in the multi-core era is to provide flexibility with respect to the available thread-level parallelism. Consequently, homogeneous multi-cores with big SMT cores are competitive high-performance, energy-efficient design points for workloads with dynamically varying active thread counts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Ding:2014:FLE, author = "Yufei Ding and Mingzhou Zhou and Zhijia Zhao and Sarah Eisenstat and Xipeng Shen", title = "Finding the limit: examining the potential and complexity of compilation scheduling for {JIT}-based runtime systems", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "607--622", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541945", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This work aims to find out the full potential of compilation scheduling for JIT-based runtime systems. Compilation scheduling determines the order in which the compilation units (e.g., functions) in a program are to be compiled or recompiled. 
It decides when what versions of the units are ready to run, and hence affects performance. But it has been a largely overlooked direction in JIT-related research, with some fundamental questions left open: How significant compilation scheduling is for performance, how good the scheduling schemes employed by existing runtime systems are, and whether a great potential exists for improvement. This study proves the strong NP-completeness of the problem, proposes a heuristic algorithm that yields near optimal schedules, examines the potential of two current scheduling schemes empirically, and explores the relations with JIT designs. It provides the first principled understanding to the complexity and potential of compilation scheduling, shedding some insights for JIT-based runtime system improvement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Lupon:2014:SHS, author = "Marc Lupon and Enric Gibert and Grigorios Magklis and Sridhar Samudrala and Ra{\'u}l Mart{\'\i}nez and Kyriakos Stavrou and David R. Ditzel", title = "Speculative hardware\slash software co-designed floating-point multiply-add fusion", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "623--638", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541978", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A Fused Multiply-Add (FMA) instruction is currently available in many general-purpose processors. It increases performance by reducing latency of dependent operations and increases precision by computing the result as an indivisible operation with no intermediate rounding. 
However, since the arithmetic behavior of a single-rounding FMA operation is different than independent FP multiply followed by FP add instructions, some algorithms require significant revalidation and rewriting efforts to work as expected when they are compiled to operate with FMA --- a cost that developers may not be willing to pay. Because of that, abundant legacy applications are not able to utilize FMA instructions. In this paper we propose a novel HW/SW collaborative technique that is able to efficiently execute workloads with increased utilization of FMA, by adding the option to get the same numerical result as separate FP multiply and FP add pairs. In particular, we extended the host ISA of a HW/SW co-designed processor with a new Combined Multiply-Add (CMA) instruction that performs an FMA operation with an intermediate rounding. This new instruction is used by a transparent dynamic translation software layer that uses a speculative instruction-fusion optimization to transform FP multiply and FP add sequences into CMA instructions. The FMA unit has been slightly modified to support both single-rounding and double-rounding fused instructions without increasing their latency and to provide a conservative fall-back path in case of misspeculation. 
Evaluation on a cycle-accurate timing simulator showed that CMA improved SPECfp performance by 6.3\% and reduced executed instructions by 4.7\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Schulte:2014:PCS, author = "Eric Schulte and Jonathan Dorn and Stephen Harding and Stephanie Forrest and Westley Weimer", title = "Post-compiler software optimization for reducing energy", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "639--652", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541980", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern compilers typically optimize for executable size and speed, rarely exploring non-functional properties such as power efficiency. These properties are often hardware-specific, time-intensive to optimize, and may not be amenable to standard dataflow optimizations. We present a general post-compilation approach called Genetic Optimization Algorithm (GOA), which targets measurable non-functional aspects of software execution in programs that compile to x86 assembly. GOA combines insights from profile-guided optimization, superoptimization, evolutionary computation and mutational robustness. GOA searches for program variants that retain required functional behavior while improving non-functional behavior, using characteristic workloads and predictive modeling to guide the search. The resulting optimizations are validated using physical performance measurements and a larger held-out test suite. 
Our experimental results on PARSEC benchmark programs show average energy reductions of 20\%, both for a large AMD system and a small Intel system, while maintaining program functionality on target workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Wood:2014:RSA, author = "David A. Wood", title = "Resolved: specialized architectures, languages, and system software should supplant general-purpose alternatives within a decade", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "653--654", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2563369", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The field of computing has struggled since its inception with the tension between specialization and generalization. Specialized architectures, programming languages, and system software promise better performance (across many metrics, including efficiency, productivity, etc.) for workloads that match their specialization objective. General-purpose architectures, languages, and system software sacrifice extremes of performance for specific workloads, seeking acceptable performance across a much wider range. While specialized alternatives have always had their place, general-purpose architectures, languages, and system software have dominated main-stream computing systems for the past several decades. But with Dennard scaling already gone and the end of Moore's Law looming, some have argued that general-purpose computing platforms must naturally give way to specialization. 
In this debate, two teams of highly-opinionated experts will debate the proposition that specialized architectures, languages, and system software should largely supplant general-purpose alternatives within the next decade. Arguments in favor of specialization include energy efficiency in the post-Dennard scaling era, performance scaling in the post-Moore's law era, and improvements in programmer productivity. Arguments against include the large investment needed to create specialized hardware and software components, lack of tools and interfaces to create reusable components, the semantic gap from overspecialization, and security vulnerabilities and general correctness issues due to interoperation of specialized components.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Ruwase:2014:GHF, author = "Olatunji Ruwase and Michael A. Kozuch and Phillip B. Gibbons and Todd C. Mowry", title = "{Guardrail}: a high fidelity approach to protecting hardware devices from buggy drivers", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "655--670", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541970", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Device drivers are an Achilles' heel of modern commodity operating systems, accounting for far too many system failures. Previous work on driver reliability has focused on protecting the kernel from unsafe driver side-effects by interposing an invariant-checking layer at the driver interface, but otherwise treating the driver as a black box. 
In this paper, we propose and evaluate Guardrail, which is a more powerful framework for run-time driver analysis that performs decoupled instruction-grain dynamic correctness checking on arbitrary kernel-mode drivers as they execute, thereby enabling the system to detect and mitigate more challenging correctness bugs (e.g., data races, uninitialized memory accesses) that cannot be detected by today's fault isolation techniques. Our evaluation of Guardrail shows that it can find serious data races, memory faults, and DMA faults in native Linux drivers that required fixes, including previously unknown bugs. Also, with hardware logging support, Guardrail can be used for online protection of persistent device state from driver bugs with at most 10\% overhead on the end-to-end performance of most standard I/O workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Wood:2014:LLD, author = "Benjamin P. Wood and Luis Ceze and Dan Grossman", title = "Low-level detection of language-level data races with {LARD}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "671--686", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541955", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Researchers have proposed always-on data-race exceptions as a way to avoid the ill effects of data races, but slow performance of accurate dynamic data-race detection remains a barrier to the adoption of always-on data-race exceptions. Proposals for accurate low-level (e.g., hardware) data-race detection have the potential to reduce this performance barrier. 
This paper explains why low-level data-race detectors are wrong for programs written in high-level languages (e.g., Java): they miss true data races and report false data races in these programs. To bring the benefits of low-level data-race detection to high-level languages, we design low-level abstractable race detection (LARD), an extension of the interface between low-level data-race detectors and run-time systems that enables accurate language-level data-race detection using low-level detection mechanisms. We implement accurate LARD data-race exception support for Java, coupling a modified Jikes RVM Java virtual machine and a simulated hardware race detector. We evaluate our detector's accuracy against an accurate dynamic Java data-race detector and other low-level race detectors without LARD, showing that naive accurate low-level data-race detectors suffer from many missed and false language-level races in practice, and that LARD prevents this inaccuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Zhang:2014:EES, author = "Jiaqi Zhang and Lakshminarayanan Renganarayana and Xiaolan Zhang and Niyu Ge and Vasanth Bala and Tianyin Xu and Yuanyuan Zhou", title = "{EnCore}: exploiting system environment and correlation information for misconfiguration detection", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "687--700", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541983", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As software systems become more complex and configurable, failures due to misconfigurations are becoming a critical problem. 
Such failures often have serious functionality, security and financial consequences. Further, diagnosis and remediation for such failures require reasoning across the software stack and its operating environment, making it difficult and costly. We present a framework and tool called EnCore to automatically detect software misconfigurations. EnCore takes into account two important factors that are unexploited before: the interaction between the configuration settings and the executing environment, as well as the rich correlations between configuration entries. We embrace the emerging trend of viewing systems as data, and exploit this to extract information about the execution environment in which a configuration setting is used. EnCore learns configuration rules from a given set of sample configurations. With training data enriched with the execution context of configurations, EnCore is able to learn a broad set of configuration anomalies that spans the entire system. EnCore is effective in detecting both injected errors and known real-world problems --- it finds 37 new misconfigurations in Amazon EC2 public images and 24 new configuration problems in a commercial private cloud. By systematically exploiting environment information and by learning correlation rules across multiple configuration settings, EnCore detects 1.6x to 3.5x more misconfiguration anomalies than previous approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Voskuilen:2014:HPF, author = "Gwendolyn Voskuilen and T. N. 
Vijaykumar", title = "High-performance fractal coherence", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "701--714", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541982", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Bugs in cache coherence protocols can cause system failures. Despite many advances, verification runs into state explosion for even moderately-sized systems. As multicores' core counts increase, coherence verifiability continues to be a key problem. A recent proposal, called fractal coherence, avoids the state explosion problem by applying the idea of observational equivalence between a larger system and its smaller sub-systems. A fractal protocol for a larger system is verified by design if a minimal sub-system is verified completely. While fractal coherence is a significant step forward, there are two shortcomings: (1) Architectural limitation: To achieve fractal coherence's logical hierarchy, TreeFractal, the specific fractal protocol, employs a tree architecture where each miss traverses many levels up and down the tree and each level redundantly holds its sub-trees' coherence tags. (2) Protocol restrictions: TreeFractal imposes a restriction on responses to read requests that forces read requests to obtain clean blocks from the nearest sharer even if the shared L2 or L3 is faster. These limitations impose significant performance and coherence tag state overheads. In this paper, we propose architectural support for coherence protocols to achieve scalable performance and verifiability. To address the architectural limitation, we propose FlatFractal, a directory-based architecture which decouples fractal coherence's logical hierarchy from the architecture and eliminates redundant tag state. 
To address the protocol restriction, we propose a simple change to the protocol that, while preserving observational equivalence, allows read requests to obtain the blocks from the shared L2 or L3. Our simulations show that for 16 cores, FlatFractal performs, on average, 57\% better than TreeFractal and within 3\% of a conventional directory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Kwon:2014:LOC, author = "Woo-Cheol Kwon and Tushar Krishna and Li-Shiuan Peh", title = "Locality-oblivious cache organization leveraging single-cycle multi-hop {NoCs}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "715--728", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541976", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Locality has always been a critical factor in on-chip data placement on CMPs as accessing further-away caches has in the past been more costly than accessing nearby ones. Substantial research on locality-aware designs have thus focused on keeping a copy of the data private. However, this complicates the problem of data tracking and search/invalidation; tracking the state of a line at all on-chip caches at a directory or performing full-chip broadcasts are both non-scalable and extremely expensive solutions. In this paper, we make the case for Locality-Oblivious Cache Organization (LOCO), a CMP cache organization that leverages the on-chip network to create virtual single-cycle paths between distant caches, thus redefining the notion of locality. 
LOCO is a clustered cache organization, supporting both homogeneous and heterogeneous cluster sizes, and provides near single-cycle accesses to data anywhere within the cluster, just like a private cache. Globally, LOCO dynamically creates a virtual mesh connecting all the clusters, and performs an efficient global data search and migration over this virtual mesh, without having to resort to full-chip broadcasts or perform expensive directory lookups. Trace-driven and full system simulations running SPLASH-2 and PARSEC benchmarks show that LOCO improves application run time by up to 44.5\% over baseline private and shared cache.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Kasture:2014:UEC, author = "Harshad Kasture and Daniel Sanchez", title = "{Ubik}: efficient cache sharing with strict {QoS} for latency-critical workloads", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "729--742", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541944", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Chip-multiprocessors (CMPs) must often execute workload mixes with different performance requirements. On one hand, user-facing, latency-critical applications (e.g., web search) need low tail (i.e., worst-case) latencies, often in the millisecond range, and have inherently low utilization. On the other hand, compute-intensive batch applications (e.g., MapReduce) only need high long-term average performance. In current CMPs, latency-critical and batch applications cannot run concurrently due to interference on shared resources. 
Unfortunately, prior work on quality of service (QoS) in CMPs has focused on guaranteeing average performance, not tail latency. In this work, we analyze several latency-critical workloads, and show that guaranteeing average performance is insufficient to maintain low tail latency, because microarchitectural resources with state, such as caches or cores, exert inertia on instantaneous workload performance. Last-level caches impart the highest inertia, as workloads take tens of milliseconds to warm them up. When left unmanaged, or when managed with conventional QoS frameworks, shared last-level caches degrade tail latency significantly. Instead, we propose Ubik, a dynamic partitioning technique that predicts and exploits the transient behavior of latency-critical workloads to maintain their tail latency while maximizing the cache space available to batch applications. Using extensive simulations, we show that, while conventional QoS frameworks degrade tail latency by up to 2.3x, Ubik simultaneously maintains the tail latency of latency-critical workloads and significantly improves the performance of batch applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Pichai:2014:ASA, author = "Bharath Pichai and Lisa Hsu and Abhishek Bhattacharjee", title = "Architectural support for address translation on {GPUs}: designing memory management units for {CPU\slash GPUs} with unified address spaces", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "743--758", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541942", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 4 07:12:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The proliferation of heterogeneous compute platforms, of which CPU/GPU is a 
prevalent example, necessitates a manageable programming model to ensure widespread adoption. A key component of this is a shared unified address space between the heterogeneous units to obtain the programmability benefits of virtual memory. To this end, we are the first to explore GPU Memory Management Units (MMUs) consisting of Translation Lookaside Buffers (TLBs) and page table walkers (PTWs) for address translation in unified heterogeneous systems. We show the performance challenges posed by GPU warp schedulers on TLBs accessed in parallel with L1 caches, which provide many well-known programmability benefits. In response, we propose modest TLB and PTW augmentations that recover most of the performance lost by introducing L1 parallel TLB access. We also show that a little TLB-awareness can make other GPU performance enhancements (e.g., cache-conscious warp scheduling and dynamic warp formation on branch divergence) feasible in the face of cache-parallel address translation, bringing overheads in the range deemed acceptable for CPUs (10--15\% of runtime). 
We presume this initial design leaves room for improvement but anticipate that our bigger insight, that a little TLB-awareness goes a long way in GPUs, will spur further work in this fruitful area.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS '14 conference proceedings.", } @Article{Mondal:2014:DSM, author = "Subijit Mondal and Subhashis Maitra", title = "Data security-modified {AES} algorithm and its applications", journal = j-COMP-ARCH-NEWS, volume = "42", number = "2", pages = "1--8", month = may, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2669594.2669596", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 15 16:43:20 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Now a days with the rapid development of multimedia technologies, research on safety and security are becoming more important. Multimedia data are generated and transmitted through the communication channels and the wireless media. The efficiencies of encryption based on different existing algorithms are not up to the satisfactory limit. Hence researchers are trying to modify the existing algorithm or even develop new algorithms that help to increase security with a little encryption time. Here in this paper, we have furnished a new technology to modify the AES algorithm which gives more security with a little encryption time and which can be used to encrypt using 128-bit key. Theoretical analysis on the proposed algorithm with the existing reveals the novelty of our work. 
Here we have proposed a technique to randomize the key and hidden the key data into an encrypted digital image using the basics concept of cryptography and also using the concept of digital watermarking, the concept of key-hide has also been encrypted. We have also proposed a new technique to reposition the pixels to break the correlation between them. So, the proposed scheme offers a more secure and cost effective mechanism for encryption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sen:2014:TLT, author = "Soumik Sen and Subhashis Maitra", title = "Three levels three dimensional compact coding", journal = j-COMP-ARCH-NEWS, volume = "42", number = "2", pages = "9--14", month = may, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2669594.2669597", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 15 16:43:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware and timing complexities are the major issues in current security related algorithms. Some of them shows better efficiency with respect to time and some of them reduce hardware complexities. Researchers try to solve both the problem at the same time in an efficient way. There are different existing algorithms which prove this efficiency. Here we will propose a new algorithm named as ``Three Levels Three Dimensional Compact Coding (TLTDCC)'' which will show better response time as well as it requires less hardware and also in security aspect, it will provide higher security. 
This paper explores a novelty of the work through a comparative study of the proposed algorithm with respect to different existing algorithms both in tabular method and graphically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thomasian:2014:BDA, author = "Alexander Thomasian and Bingxing Liu and Yuhui Deng", title = "Balancing disk access times in {RAID5} disk arrays in degraded mode by conditionally prioritizing fork\slash join requests", journal = j-COMP-ARCH-NEWS, volume = "42", number = "2", pages = "15--19", month = may, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2669594.2669598", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 15 16:43:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "RAID5 disk arrays with rotated parities can tolerate single disk failures by reconstructing missing blocks on demand by XORing the contents of corresponding $K$ blocks on surviving disks by a $K$-way Fork/Join ( F/J ) request, which is considered completed after the $K$ disks are accessed. $ F / J$ accesses in RAID5 are processed concurrently with interfering disk accesses. The mean response time of F/J and independent/interfering requests: $ R^{F / J} /_K$ and $ R^{\rm Ind}$ and the mean delay from the completion of the first to the last $ F / J$ task, known as task dispersion time: $ T^{\rm disp} /_K$, are performance metrics of interest. Given $ R^{F / J} /_K > R^{\rm Ind}$ with FCFS scheduling, it is desirable to equalize disk access times, but giving a higher nonpreemptive priority to disk accesses due to $ F / J$ requests with respect to interfering disk accesses results in $ R^{\rm Ind}$ \& $ R^{F / J} /_K$. We propose a continuum of conditional priority methods based on the fraction $F$ of $ F / J$ accesses completed with FCFS scheduling. 
$ F = \infty $ stands for FCFS and $ F = 0$ stands for unconditional priorities. Simulation shows that $ F = 1 / 8$ with $ K = 8$ yields $ R^{F / J} /_K \approx R^{\rm Ind}$ for three distributions of disk requests and in the range of $ F / J$ and independent disk requests considered. $F$ can be varied adaptively based on measurement results to balance disk access times.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gandhi:2014:BTI, author = "Jayneel Gandhi and Arkaprava Basu and Mark D. Hill and Michael M. Swift", title = "{BadgerTrap}: a tool to instrument x86-64 {TLB} misses", journal = j-COMP-ARCH-NEWS, volume = "42", number = "2", pages = "20--23", month = may, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2669594.2669599", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 15 16:43:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The overheads of memory management units (MMUs) have gained importance in today's systems. Detailed simulators may be too slow to gain insights into micro-architectural techniques that improve MMU efficiency. To address this issue, we propose a novel tool, BadgerTrap, which allows online instrumentation of TLB misses. It allows first-order analysis of new hardware techniques to improve MMU efficiency. The tool helps to create and analyze x86-64 TLB miss trace. 
We describe example studies to show various ways this tool can be applied to gain new research insights.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2014:INa, author = "Mark Thorson", title = "{Internet} nuggets", journal = j-COMP-ARCH-NEWS, volume = "42", number = "2", pages = "24--36", month = may, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2669594.2669601", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 15 16:43:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Towles:2014:UCI, author = "Brian Towles and J. P. Grossman and Brian Greskamp and David E. Shaw", title = "Unifying on-chip and inter-node switching within the {Anton 2} network", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "1--12", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665677", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The design of network architectures has become increasingly complex as the chips connected by inter-node networks have emerged as distributed systems in their own right, complete with their own on-chip networks. In Anton 2, a massively parallel special-purpose supercomputer for molecular dynamics simulations, we managed this complexity by reusing the on-chip network as a switch for inter-node traffic. This unified network approach introduces several design challenges. Maintaining fairness within the inter-node network is difficult, as each hop becomes a sequence of many on-chip routing decisions. 
We addressed this problem with an inverse-weighted arbiter that ensures fairness with low implementation costs. Balancing the load of inter-node traffic across the on-chip network is also critical, and we adopted an optimization approach to design an appropriate routing algorithm. Finally, the on-chip routers carry inter-node traffic, so they must implement inter-node virtual channels to avoid deadlock. In order to keep the routers small and fast, we developed a deadlock-free routing algorithm that reduces the number of virtual channels by one-third relative to previous approaches. The resulting Anton 2 network implementation efficiently utilizes its inter-node channels and provides low messaging latency, while occupying a modest amount of silicon area.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Putnam:2014:RFA, author = "Andrew Putnam and Adrian M. Caulfield and Eric S. Chung and Derek Chiou and Kypros Constantinides and John Demme and Hadi Esmaeilzadeh and Jeremy Fowers and Gopi Prashanth Gopal and Jan Gray and Michael Haselman and Scott Hauck and Stephen Heil and Amir Hormati and Joo-Young Kim and Sitaram Lanka and James Larus and Eric Peterson and Simon Pope and Aaron Smith and Jason Thong and Phillip Yi Xiao and Doug Burger", title = "A reconfigurable fabric for accelerating large-scale datacenter services", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "13--24", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665678", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Datacenter workloads demand high computational capabilities, flexibility, power efficiency, and low cost. 
It is challenging to improve all of these factors simultaneously. To advance datacenter capabilities beyond what commodity server designs can provide, we have designed and built a composable, reconfigurable fabric to accelerate portions of large-scale software services. Each instantiation of the fabric consists of a 6x8 2-D torus of high-end Stratix V FPGAs embedded into a half-rack of 48 machines. One FPGA is placed into each server, accessible through PCIe, and wired directly to other FPGAs with pairs of 10 Gb SAS cables. In this paper, we describe a medium-scale deployment of this fabric on a bed of 1,632 servers, and measure its efficacy in accelerating the Bing web search engine. We describe the requirements and architecture of the system, detail the critical engineering challenges and solutions needed to make the system robust in the presence of failures, and measure the performance, power, and resilience of the system when ranking candidate documents. Under high load, the large-scale reconfigurable fabric improves the ranking throughput of each server by a factor of 95\% for a fixed latency distribution --- or, while maintaining equivalent throughput, reduces the tail latency by 29\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Daya:2014:SCR, author = "Bhavya K. Daya and Chia-Hsin Owen Chen and Suvinay Subramanian and Woo-Cheol Kwon and Sunghyun Park and Tushar Krishna and Jim Holt and Anantha P. 
Chandrakasan and Li-Shiuan Peh", title = "{SCORPIO}: a $ 36$-core research chip demonstrating snoopy coherence on a scalable mesh {NoC} with in-network ordering", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "25--36", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665680", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the many-core era, scalable coherence and on-chip interconnects are crucial for shared memory processors. While snoopy coherence is common in small multicore systems, directory-based coherence is the de facto choice for scalability to many cores, as snoopy relies on ordered interconnects which do not scale. However, directory-based coherence does not scale beyond tens of cores due to excessive directory area overhead or inaccurate sharer tracking. Prior techniques supporting ordering on arbitrary unordered networks are impractical for full multicore chip designs. We present SCORPIO, an ordered mesh Network-on-Chip (NoC) architecture with a separate fixed-latency, bufferless network to achieve distributed global ordering. Message delivery is decoupled from the ordering, allowing messages to arrive in any order and at any time, and still be correctly ordered. The architecture is designed to plug-and-play with existing multicore IP and with practicality, timing, area, and power as top concerns. 
Full-system 36 and 64-core simulations on SPLASH-2 and PARSEC benchmarks show an average application runtime reduction of 24.1\% and 12.9\%, in comparison to distributed directory and AMD HyperTransport coherence protocols, respectively. The SCORPIO architecture is incorporated in an 11 mm-by-13mm chip prototype, fabricated in IBM 45nm SOI technology, comprising 36 Freescale e200 Power Architecture\TM{} cores with private L1 and L2 caches interfacing with the NoC via ARM AMBA, along with two Cadence on-chip DDR2 controllers. The chip prototype achieves a post synthesis operating frequency of 1 GHz (833MHz post-layout) with an estimated power of 28.8W (768mW per tile), while the network consumes only 10\% of tile area and 19\% of tile power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Upasani:2014:ACD, author = "Gaurang Upasani and Xavier Vera and Antonio Gonz{\'a}lez", title = "Avoiding core's {DUE \& SDC} via acoustic wave detectors and tailored error containment and recovery", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "37--48", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665682", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The trend of downsizing transistors and operating voltage scaling has made the processor chip more sensitive against radiation phenomena making soft errors an important challenge. New reliability techniques for handling soft errors in the logic and memories that allow meeting the desired failures-in-time (FIT) target are key to keep harnessing the benefits of Moore's law. 
The failure to scale the soft error rate caused by particle strikes, may soon limit the total number of cores that one may have running at the same time. This paper proposes a light-weight and scalable architecture to eliminate silent data corruption errors (SDC) and detected unrecoverable errors (DUE) of a core. The architecture uses acoustic wave detectors for error detection. We propose to recover by confining the errors in the cache hierarchy, allowing us to deal with the relatively long detection latencies. Our results show that the proposed mechanism protects the whole core (logic, latches and memory arrays) incurring performance overhead as low as 0.60\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Chen:2014:MLC, author = "Long Chen and Zhao Zhang", title = "{MemGuard}: a low cost and energy efficient design to support and enhance memory system reliability", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "49--60", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665683", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory system reliability is increasingly a concern as memory cell density and capacity continue to grow. The conventional approach is to use redundant memory bits for error detection and correction, with significant storage, cost and power overheads. In this paper, we propose a novel, system-level scheme called MemGuard for memory error detection. With OS-based checkpointing, it is also able to recover program execution from memory errors. The memory error detection of MemGuard is motivated by memory integrity verification using log hashes. 
It is much stronger than SECDED in error detection, incurs negligible hardware cost and energy overhead and no storage overhead, and is compatible with various memory organizations. It may play the role of ECC memory in consumer-level computers and mobile devices, without the shortcomings of ECC memory. In server computers, it may complement SECDED ECC or Chipkill Correct by providing even stronger error detection. We have comprehensively investigated and evaluated the feasibility and reliability of MemGuard. We show that using an incremental multiset hash function and a non-cryptographic hash function, the performance and energy overheads of MemGuard are negligible. We use the mathematical deduction and synthetic simulation to prove that MemGuard is robust and reliable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Hari:2014:GGE, author = "Siva Kumar Sastry Hari and Radha Venkatagiri and Sarita V. Adve and Helia Naeimi", title = "{GangES}: gang error simulation for hardware resiliency evaluation", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "61--72", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665685", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As technology scales, the hardware reliability challenge affects a broad computing market, rendering traditional redundancy based solutions too expensive. Software anomaly based hardware error detection has emerged as a low cost reliability solution, but suffers from Silent Data Corruptions (SDCs). It is crucial to accurately evaluate SDC rates and identify SDC producing software locations to develop software-centric low-cost hardware resiliency solutions. 
A recent tool, called Relyzer, systematically analyzes an entire application's resiliency to single bit soft-errors using a small set of carefully selected error injection sites. Relyzer provides a practical resiliency evaluation mechanism but still requires significant evaluation time, most of which is spent on error simulations. This paper presents a new technique called GangES (Gang Error Simulator) that aims to reduce error simulation time. GangES observes that a set or gang of error simulations that result in the same intermediate execution state (after their error injections) will produce the same error outcome; therefore, only one simulation of the gang needs to be completed, resulting in significant overall savings in error simulation time. GangES leverages program structure to carefully select when to compare simulations and what state to compare. For our workloads, GangES saves 57\% of the total error simulation time with an overhead of just 1.6\%. This paper also explores pure program analyses based techniques that could obviate the need for tools such as GangES altogether. The availability of Relyzer+GangES allows us to perform a detailed evaluation of such techniques. We evaluate the accuracy of several previously proposed program metrics.
We find that the metrics we considered and their various linear combinations are unable to adequately predict an instruction's vulnerability to SDCs, further motivating the use of Relyzer+GangES style techniques as valuable solutions for the hardware error resiliency evaluation problem", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Wadden:2014:RWD, author = "Jack Wadden and Alexander Lyashevsky and Sudhanva Gurumurthi and Vilas Sridharan and Kevin Skadron", title = "Real-world design and evaluation of compiler-managed {GPU} redundant multithreading", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "73--84", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665686", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Reliability for general purpose processing on the GPU (GPGPU) is becoming a weak link in the construction of reliable supercomputer systems. Because hardware protection is expensive to develop, requires dedicated on-chip resources, and is not portable across different architectures, the efficiency of software solutions such as redundant multithreading (RMT) must be explored. This paper presents a real-world design and evaluation of automatic software RMT on GPU hardware. We first describe a compiler pass that automatically converts GPGPU kernels into redundantly threaded versions. We then perform detailed power and performance evaluations of three RMT algorithms, each of which provides fault coverage to a set of structures in the GPU. Using real hardware, we show that compiler-managed software RMT has highly variable costs. 
We further analyze the individual costs of redundant work scheduling, redundant computation, and inter-thread communication, showing that no single component in general is responsible for high overheads across all applications; instead, certain workload properties tend to cause RMT to perform well or poorly. Finally, we demonstrate the benefit of architectural support for RMT with a specific example of fast, register-level thread communication", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Chen:2014:ARA, author = "Tianshi Chen and Qi Guo and Ke Tang and Olivier Temam and Zhiwei Xu and Zhi-Hua Zhou and Yunji Chen", title = "{ArchRanker}: a ranking approach to design space exploration", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "85--96", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665688", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Architectural Design Space Exploration (DSE) is a notoriously difficult problem due to the exponentially large size of the design space and long simulation times. Previously, many studies proposed to formulate DSE as a regression problem which predicts architecture responses (e.g., time, power) of a given architectural configuration. Several of these techniques achieve high accuracy, though often at the cost of significant simulation time for training the regression models. We argue that the information the architect mostly needs during the DSE process is whether a given configuration will perform better than another one in the presences of design constraints, or better than any other one seen so far, rather than precisely estimating the performance of that configuration. 
Based on this observation, we propose a novel ranking-based approach to DSE where we train a model to predict which of two architecture configurations will perform best. We show that, not only this ranking model more accurately predicts the relative merit of two architecture configurations than an ANN-based state-of-the-art regression model, but also that it requires much fewer training simulations to achieve the same accuracy, or that it can be used for and is even better at quantifying the performance gap between two configurations. We implement the framework for training and using this model, called ArchRanker, and we evaluate it on several DSE scenarios (unicore/multicore design spaces, and both time and power performance metrics). We try to emulate as closely as possible the DSE process by creating constraint-based scenarios, or an iterative DSE process. We find that ArchRanker makes 29.68\% to 54.43\% fewer incorrect predictions on pairwise relative merit of configurations (tested with 79,800 configuration pairs) than an ANN-based regression model across all DSE scenarios considered (values averaged over all benchmarks for each scenario).
We also find that, to achieve the same accuracy as ArchRanker, the ANN often requires three times more training simulations", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Shao:2014:APR, author = "Yakun Sophia Shao and Brandon Reagen and Gu-Yeon Wei and David Brooks", title = "{Aladdin}: a {Pre-RTL}, power-performance accelerator simulator enabling large design space exploration of customized architectures", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "97--108", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665689", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware specialization, in the form of accelerators that provide custom datapath and control for specific algorithms and applications, promises impressive performance and energy advantages compared to traditional architectures. Current research in accelerator analysis relies on RTL-based synthesis flows to produce accurate timing, power, and area estimates. Such techniques not only require significant effort and expertise but are also slow and tedious to use, making large design space exploration infeasible. To overcome this problem, we present Aladdin, a pre-RTL, power-performance accelerator modeling framework and demonstrate its application to system-on-chip (SoC) simulation. Aladdin estimates performance, power, and area of accelerators within 0.9\%, 4.9\%, and 6.6\% with respect to RTL implementations. 
Integrated with architecture-level core and memory hierarchy simulators, Aladdin provides researchers an approach to model the power and performance of accelerators in an SoC environment", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Badr:2014:SST, author = "Mario Badr and Natalie Enright Jerger", title = "{SynFull}: synthetic traffic models capturing cache coherent behaviour", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "109--120", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665691", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern and future many-core systems represent complex architectures. The communication fabrics of these large systems heavily influence their performance and power consumption. Current simulation methodologies for evaluating networks-on-chip (NoCs) are not keeping pace with the increased complexity of our systems; architects often want to explore many different design knobs quickly. Methodologies that capture workload trends with faster simulation times are highly beneficial at early stages of architectural exploration. We propose SynFull, a synthetic traffic generation methodology that captures both application and cache coherence behaviour to rapidly evaluate NoCs. SynFull allows designers to quickly indulge in detailed performance simulations without the cost of long-running full-system simulation. By capturing a full range of application and coherence behaviour, architects can avoid the over- or under-design of the network as may occur when using traditional synthetic traffic patterns such as uniform random. 
SynFull has errors as low as 0.3\% and provides 50x speedup on average over full-system simulation", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Venkat:2014:HID, author = "Ashish Venkat and Dean M. Tullsen", title = "Harnessing {ISA} diversity: design of a {heterogeneous-ISA} chip multiprocessor", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "121--132", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665692", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Heterogeneous multicore architectures have the potential for high performance and energy efficiency. These architectures may be composed of small power-efficient cores, large high-performance cores, and/or specialized cores that accelerate the performance of a particular class of computation. Architects have explored multiple dimensions of heterogeneity, both in terms of micro-architecture and specialization. While early work constrained the cores to share a single ISA, this work shows that allowing heterogeneous ISAs further extends the effectiveness of such architectures. This work exploits the diversity offered by three modern ISAs: Thumb, x86-64, and Alpha.
This architecture has the potential to outperform the best single-ISA heterogeneous architecture by as much as 21\%, with 23\% energy savings and a reduction of 32\% in Energy Delay Product.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Sembrant:2014:DDD, author = "Andreas Sembrant and Erik Hagersten and David Black-Schaffer", title = "The {Direct-to-Data (D2D)} cache: navigating the cache hierarchy with a single lookup", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "133--144", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665694", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern processors optimize for cache energy and performance by employing multiple levels of caching that address bandwidth, low-latency and high-capacity. A request typically traverses the cache hierarchy, level by level, until the data is found, thereby wasting time and energy in each level. In this paper, we present the Direct-to-Data (D2D) cache that locates data across the entire cache hierarchy with a single lookup. To navigate the cache hierarchy, D2D extends the TLB with per cache-line location information that indicates in which cache and way the cache line is located. This allows the D2D cache to: (1) skip levels in the hierarchy (by accessing the right cache level directly), (2) eliminate extra data array reads (by reading the right way directly), (3) avoid tag comparisons (by eliminating the tag arrays), and (4) go directly to DRAM on cache misses (by checking the TLB). 
This reduces the L2 latency by 40\% and saves 5-17\% of the total cache hierarchy energy. D2D's lower L2 latency directly improves L2 sensitive applications' performance by 5-14\%. More significantly, we can take advantage of the L2 latency reduction to optimize other parts of the micro-architecture. For example, we can reduce the ROB size for the L2 bound applications by 25\%, or we can reduce the L1 cache size, delivering an overall 21\% energy savings across all benchmarks, without hurting performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Arelakis:2014:SSC, author = "Angelos Arelakis and Per Stenstrom", title = "{SC2}: a statistical compression cache scheme", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "145--156", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665696", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Low utilization of on-chip cache capacity limits performance and wastes energy because of the long latency, limited bandwidth, and energy consumption associated with off-chip memory accesses. Value replication is an important source of low capacity utilization. While prior cache compression techniques manage to code frequent values densely, they trade off a high compression ratio for low decompression latency, thus missing opportunities to utilize capacity more effectively. This paper presents, for the first time, a detailed design space exploration of caches that utilize statistical compression.
We show that more aggressive approaches like Huffman coding, which have been neglected in the past due to the high processing overhead for (de)compression, are suitable techniques for caches and memory. Based on our key observation that value locality varies little over time and across applications, we first demonstrate that the overhead of statistics acquisition for code generation is low because new encodings are needed rarely, making it possible to off-load it to software routines. We then show that the high compression ratio obtained by Huffman-coding makes it possible to utilize the performance benefits of 4X larger last-level caches with about 50\% lower power consumption than such larger caches", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Seshadri:2014:DBI, author = "Vivek Seshadri and Abhishek Bhowmick and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry", title = "The dirty-block index", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "157--168", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665697", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "On-chip caches maintain multiple pieces of metadata about each cached block --- e.g., dirty bit, coherence information, ECC. Traditionally, such metadata for each block is stored in the corresponding tag entry in the tag store. While this approach is simple to implement and scalable, it necessitates a full tag store lookup for any metadata query --- resulting in high latency and energy consumption. We find that this approach is inefficient and inhibits several cache optimizations. 
In this work, we propose a new way of organizing the dirty bit information that enables simpler and more efficient implementations of several optimizations. In our proposed approach, we remove the dirty bits from the tag store and organize it differently in a separate structure, which we call the Dirty-Block Index (DBI). The organization of DBI is simple: it consists of multiple entries, each corresponding to some row in DRAM. A bit vector in each entry tracks whether or not each block in the corresponding DRAM row is dirty. We demonstrate the benefits of DBI by using it to simultaneously and efficiently implement three optimizations proposed by prior work: (1) Aggressive DRAM-aware writeback, (2) Bypassing cache lookups, and (3) Heterogeneous ECC for clean/dirty blocks. DBI, with all three optimizations enabled, improves performance by 31\% compared to the baseline (by 6\% compared to the best previous mechanism) while reducing overall cache area cost by 8\% compared to prior approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Liu:2014:GVM, author = "Lei Liu and Yong Li and Zehan Cui and Yungang Bao and Mingyu Chen and Chengyong Wu", title = "Going vertical in memory management: handling multiplicity by multi-policy", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "169--180", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665698", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many emerging applications from various domains often exhibit heterogeneous memory characteristics.
When running in combination on parallel platforms, these applications present a daunting variety of workload behaviors that challenge the effectiveness of any memory allocation strategy. Prior partitioning-based or random memory allocation schemes typically manage only one level of the memory hierarchy and often target specific workloads. To handle diverse and dynamically changing memory and cache allocation needs, we augment existing ``horizontal'' cache/DRAM bank partitioning with vertical partitioning and explore the resulting multi-policy space. We study the performance of these policies for over 2000 workloads and correlate the results with application characteristics via a data mining approach. Based on this correlation we derive several practical memory allocation rules that we integrate into a unified multi-policy framework to guide resources partitioning and coalescing for dynamic and diverse multiprogrammed/threaded workloads. We implement our approach in Linux kernel 2.6.32 as a restructured page indexing system plus a series of kernel modules. Extensive experiments show that, in practice, our framework can select proper memory allocation policy and consistently outperforms the unmodified Linux kernel, achieving up to 11\% performance gains compared to prior techniques", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Orr:2014:FGT, author = "Marc S. Orr and Bradford M. Beckmann and Steven K. Reinhardt and David A.
Wood", title = "Fine-grain task aggregation and coordination on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "181--192", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665701", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In general-purpose graphics processing unit (GPGPU) computing, data is processed by concurrent threads executing the same function. This model, dubbed single-instruction/multiple-thread (SIMT), requires programmers to coordinate the synchronous execution of similar operations across thousands of data elements. To alleviate this programmer burden, Gaster and Howes outlined the channel abstraction, which facilitates dynamically aggregating asynchronously produced fine-grain work into coarser-grain tasks. However, no practical implementation has been proposed. To this end, we propose and evaluate the first channel implementation. To demonstrate the utility of channels, we present a case study that maps the fine-grain, recursive task spawning in the Cilk programming language to channels by representing it as a flow graph. To support data-parallel recursion in bounded memory, we propose a hardware mechanism that allows wavefronts to yield their execution resources. Through channels and wavefront yield, we implement four Cilk benchmarks.
We show that Cilk can scale with the GPU architecture, achieving speedups of as much as 4.3x on eight compute units", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Tanasic:2014:EPM, author = "Ivan Tanasic and Isaac Gelado and Javier Cabezas and Alex Ramirez and Nacho Navarro and Mateo Valero", title = "Enabling preemptive multiprogramming on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "193--204", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665702", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "GPUs are being increasingly adopted as compute accelerators in many domains, spanning environments from mobile systems to cloud computing. These systems are usually running multiple applications, from one or several users. However GPUs do not provide the support for resource sharing traditionally expected in these scenarios. Thus, such systems are unable to provide key multiprogrammed workload requirements, such as responsiveness, fairness or quality of service. In this paper, we propose a set of hardware extensions that allow GPUs to efficiently support multiprogrammed GPU workloads. We argue for preemptive multitasking and design two preemption mechanisms that can be used to implement GPU scheduling policies. We extend the architecture to allow concurrent execution of GPU kernels from different user processes and implement a scheduling policy that dynamically distributes the GPU cores among concurrently running kernels, according to their priorities. We extend the NVIDIA GK110 (Kepler) like GPU architecture with our proposals and evaluate them on a set of multiprogrammed workloads with up to eight concurrent processes. 
Our proposals improve execution time of high-priority processes by 15.6x, the average application turnaround time between 1.5x to 2x, and system fairness up to 3.4x", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Voitsechov:2014:SGM, author = "Dani Voitsechov and Yoav Etsion", title = "Single-graph multiple flows: energy efficient design alternative for {GPGPUs}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "205--216", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665703", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present the single-graph multiple-flows (SGMF) architecture that combines coarse-grain reconfigurable computing with dynamic dataflow to deliver massive thread-level parallelism. The CUDA-compatible SGMF architecture is positioned as an energy efficient design alternative for GPGPUs. The architecture maps a compute kernel, represented as a dataflow graph, onto a coarse-grain reconfigurable fabric composed of a grid of interconnected functional units. Each unit dynamically schedules instances of the same static instruction originating from different CUDA threads. The dynamically scheduled functional units enable streaming the data of multiple threads (or graph flows, in SGMF parlance) through the grid. 
The combination of statically mapped instructions and direct communication between functional units obviate the need for a full instruction pipeline and a centralized register file, whose energy overheads burden GPGPU. We show that the SGMF architecture delivers performance comparable to that of contemporary GPGPUs while consuming 57\% less energy on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Campanoni:2014:HRA, author = "Simone Campanoni and Kevin Brownell and Svilen Kanev and Timothy M. Jones and Gu-Yeon Wei and David Brooks", title = "{HELIX--RC}: an architecture-compiler co-design for automatic parallelization of irregular programs", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "217--228", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665705", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Data dependences in sequential programs limit parallelization because extracted threads cannot run independently. Although thread-level speculation can avoid the need for precise dependence analysis, communication overheads required to synchronize actual dependences counteract the benefits of parallelization. To address these challenges, we propose a lightweight architectural enhancement co-designed with a parallelizing compiler, which together can decouple communication from thread execution.
Simulations of these approaches, applied to a processor with 16 Intel Atom-like cores, show an average of 6.85x performance speedup for six SPEC CINT2000 benchmarks", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Smith:2014:EDN, author = "James E. Smith", title = "Efficient digital neurons for large scale cortical architectures", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "229--240", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665707", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Digital neurons are implemented with the goal of supporting research and development of architectures which implement the computational paradigm of the neocortex. Four spiking digital neurons are implemented at the register transfer level in a manner that permits side-by-side comparisons. Two of the neurons contain two stages of exponential decay, one for synapse conductances and one for membrane potential. The other two neurons contain only one stage of exponential decay for membrane potential. The two stage neurons respond to an input spike with a change in membrane potential that has a non-infinite leading edge slope; the one stage neurons exhibit a change in membrane potential with an abrupt, infinite leading edge slope. This leads to a behavioral difference when a number of input spikes occur in very close time proximity. However, the one stage neurons are as much as a factor of ten more energy efficient than the two stage neurons, as measured by the number of dynamic add-equivalent operations. A new two stage neuron is proposed.
This neuron reduces the number of decay components and implements decays in both stages via piece-wise linear approximation. Together, these simplifications yield two stage neuron behavior with energy efficiency that is only about a factor of two worse than the simplest one stage neuron.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Swaminathan:2014:EAS, author = "Karthik Swaminathan and Huichu Liu and Jack Sampson and Vijaykrishnan Narayanan", title = "An examination of the architecture and system-level tradeoffs of employing steep slope devices in {$3$D} {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "241--252", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665709", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "For any given application, there is an optimal throughput point in the space of per-processor performance and the number of such processors given to that application. However, due to thermal, yield, and other constraints, not all of these optimal points can plausibly be constructed with a given technology. In this paper, we look at how emerging steep slope devices, 3D circuit integration, and trends in process technology scaling will combine to shift the boundaries of both attainable performance, and the optimal set of technologies to employ to achieve it. We propose a heterogeneous-technology 3D architecture capable of operating efficiently at an expanded number of points in this larger design space and devise a heterogeneity and thermal aware scheduling algorithm to exploit its potential. 
Our heterogeneous mapping techniques are capable of producing speedups ranging from 17\% for a high end server workloads running at around 90${}^\circ $C to over 160\% for embedded systems running below 60${}^\circ $C", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Venkatesan:2014:SST, author = "Rangharajan Venkatesan and Shankar Ganesh Ramasubramanian and Swagath Venkataramani and Kaushik Roy and Anand Raghunathan", title = "{STAG}: spintronic-tape architecture for {GPGPU} cache hierarchies", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "253--264", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665710", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "General-purpose Graphics Processing Units (GPGPUs) are widely used for executing massively parallel workloads from various application domains. Feeding data to the hundreds to thousands of cores that current GPGPUs integrate places great demands on the memory hierarchy, fueling an ever-increasing demand for on-chip memory. In this work, we propose STAG, a high density, energy-efficient GPGPU cache hierarchy design using a new spintronic memory technology called Domain Wall Memory (DWM). DWMs inherently offer unprecedented benefits in density by storing multiple bits in the domains of a ferromagnetic nanowire, which logically resembles a bit-serial tape. However, this structure also leads to a unique challenge that the bits must be sequentially accessed by performing ``shift'' operations, resulting in variable and potentially higher access latencies. 
To address this challenge, STAG utilizes a number of architectural techniques: (i) a hybrid cache organization that employs different DWM bit-cells to realize the different memory arrays within the GPGPU cache hierarchy, (ii) a clustered, bit-interleaved organization, in which the bits in a cache block are spread across a cluster of DWM tapes, allowing parallel access, (iii) tape head management policies that predictively configure DWM arrays to reduce the expected number of shift operations for subsequent accesses, and (iv) a shift aware promotion buffer (SaPB), in which accesses to the DWM cache are predicted based on intra-warp locality, and locations that would incur a large shift penalty are promoted to a smaller buffer. Over a wide range of benchmarks from the Rodinia, ISPASS and Parboil suites, STAG achieves significant benefits in performance (12.1\% over SRAM and 5.8\% over STT-MRAM) and energy (3.3X over SRAM and 2.6X over STT-MRAM)", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Pelley:2014:MP, author = "Steven Pelley and Peter M. Chen and Thomas F. Wenisch", title = "Memory persistency", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "265--276", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665712", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging nonvolatile memory technologies (NVRAM) promise the performance of DRAM with the persistence of disk. However, constraining NVRAM write order, necessary to ensure recovery correctness, limits NVRAM write concurrency and degrades throughput.
We require new memory interfaces to minimally describe write constraints and allow high performance and high concurrency data structures. These goals strongly resemble memory consistency. Whereas memory consistency concerns the order that memory operations are observed between numerous processors, persistent memory systems must constrain the order that writes occur with respect to failure. We introduce memory persistency, a new approach to designing persistent memory interfaces, building on memory consistency. Similar to memory consistency, memory persistency models may be relaxed to improve performance. We describe the design space of memory persistency and desirable features that such a memory system requires. Finally, we introduce several memory persistency models and evaluate their ability to expose NVRAM write concurrency using two implementations of a persistent queue. Our results show that relaxed persistency models accelerate system throughput 30-fold by reducing NVRAM write constraints", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Hoseinzadeh:2014:RAL, author = "Morteza Hoseinzadeh and Mohammad Arjomand and Hamid Sarbazi-Azad", title = "Reducing access latency of {MLC PCMs} through line striping", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "277--288", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665713", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Although phase change memory with multi-bit storage capability (known as MLC PCM) offers a good combination of high bit-density and non-volatility, its performance is severely impacted by the increased read/write latency. 
Regarding read operation, access latency increases almost linearly with respect to cell density (the number of bits stored in a cell). Since reads are latency critical, they can seriously impact system performance. This paper alleviates the problem of slow reads in the MLC PCM by exploiting a fundamental property of MLC devices: the Most-Significant Bit (MSB) of MLC cells can be read as fast as SLC cells, while reading the Least-Significant Bits (LSBs) is slower. We propose Striped PCM (SPCM), a memory architecture that leverages this property to keep MLC read latency in the order of SLC's. In order to avoid extra writes onto memory cells as a result of striping memory lines, the proposed design uses a pairing write queue to synchronize write-back requests associated with blocks that are paired in striping mode. Our evaluation shows that our design significantly improves the average memory access latency by more than 30\% and IPC by up to 25\% (10\%, on average), with a slight overhead in memory energy (0.7\%) in a 4-core CMP model running memory-intensive benchmarks", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Jung:2014:HHI, author = "Myoungsoo Jung and Wonil Choi and Shekhar Srikantaiah and Joonhyuk Yoo and Mahmut T. Kandemir", title = "{HIOS}: a host interface {I/O} scheduler for solid state disks", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "289--300", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665715", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Garbage collection (GC) and resource contention on I/O buses (channels) are among the critical bottlenecks in Solid State Disks (SSDs) that cannot be easily hidden. 
Most existing I/O scheduling algorithms in the host interface logic (HIL) of state-of-the-art SSDs are oblivious to such low-level performance bottlenecks in SSDs. As a result, SSDs may violate quality of service (QoS) requirements by not being able to meet the deadlines of I/O requests. In this paper, we propose a novel host interface I/O scheduler that is both GC-aware and QoS-aware. The proposed scheduler redistributes the GC overheads across non-critical I/O requests and reduces channel resource contention. Our experiments with workloads from various application domains reveal that the proposed scheduler reduces the standard deviation for latency over state-of-the-art I/O schedulers used in the HIL by 52.5\%, and the worst-case latency by 86.6\%. In addition, for I/O requests with sizes smaller than a superpage, our proposed scheduler avoids channel resource conflicts and reduces latency by 29.2\% compared to the state-of-the-art", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Lo:2014:TEP, author = "David Lo and Liqun Cheng and Rama Govindaraju and Luiz Andr{\'e} Barroso and Christos Kozyrakis", title = "Towards energy proportionality for large-scale latency-critical workloads", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "301--312", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665718", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Reducing the energy footprint of warehouse-scale computer (WSC) systems is key to their affordability, yet difficult to achieve in practice. 
The lack of energy proportionality of typical WSC hardware and the fact that important workloads (such as search) require all servers to remain up regardless of traffic intensity renders existing power management techniques ineffective at reducing WSC energy use. We present PEGASUS, a feedback-based controller that significantly improves the energy proportionality of WSC systems, as demonstrated by a real implementation in a Google search cluster. PEGASUS uses request latency statistics to dynamically adjust server power management limits in a fine-grain manner, running each server just fast enough to meet global service-level latency objectives. In large cluster experiments, PEGASUS reduces power consumption by up to 20\%. We also estimate that a distributed version of PEGASUS can nearly double these savings", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Liu:2014:SRJ, author = "Yanpei Liu and Stark C. Draper and Nam Sung Kim", title = "{SleepScale}: runtime joint speed scaling and sleep states management for power efficient data centers", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "313--324", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665719", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Power consumption in data centers has been growing significantly in recent years. To reduce power, servers are being equipped with increasingly sophisticated power management mechanisms. Different mechanisms offer dramatically different trade-offs between power savings and performance penalties. 
Considering the complexity, variety, and temporally varying nature of the applications hosted in a typical data center, intelligently determining which power management policy to use and when is a complicated task. In this paper we analyze a system model featuring both performance scaling and low-power states. We reveal the interplay between performance scaling and low-power states via intensive simulation and analytic verification. Based on the observations, we present SleepScale, a runtime power management tool designed to efficiently exploit existing power control mechanisms. At run time, SleepScale characterizes power consumption and quality-of-service (QoS) for each low-power state and frequency setting, and selects the best policy for a given QoS constraint. We evaluate SleepScale using workload traces from data centers and achieve significant power savings relative to conventional power management strategies", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Liu:2014:OVM, author = "Ming Liu and Tao Li", title = "Optimizing virtual machine consolidation performance on {NUMA} server architecture for cloud workloads", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "325--336", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665720", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Server virtualization and workload consolidation enable multiple workloads to share a single physical server, resulting in significant energy savings and utilization improvements. 
The shift of physical server architectures to NUMA and the increasing popularity of scale-out cloud applications undermine workload consolidation efficiency and result in overall system degradation. In this work, we characterize the consolidation of cloud workloads on NUMA virtualized systems, estimate four different sources of architecture overhead, and explore optimization opportunities beyond the default NUMA-aware hypervisor memory management. Motivated by the observed architectural impact on cloud workload consolidation performance, we propose three optimization techniques incorporating NUMA access overhead into the hypervisor's virtual machine memory allocation and page fault handling routines. Among these, estimation of the memory zone access overhead serves as a foundation for the other two techniques: a NUMA overhead aware buddy allocator and a P2M swap FIFO. Cache hit rate, cycle loss due to cache miss, and IPC serve as indicators to estimate the access cost of each memory node. Our optimized buddy allocator dynamically selects low-overhead memory zones and ``proportionally'' distributes memory pages across target nodes. The P2M swap FIFO records recently unused PFN, MFN lists for mapping exchanges to rebalance memory access pressure within one domain. Our real system based evaluations show a 41.1\% performance improvement when consolidating 16-VMs on a 4-socket server (the proposed allocator contributes 22.8\% of the performance gain and the P2M swap FIFO accounts for the rest). Furthermore, our techniques can cooperate well with other methods (i.e.
vCPU migration) and scale well when varying VM memory size and the number of sockets in a physical host", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{O:2014:RBD, author = "Seongil O and Young Hoon Son and Nam Sung Kim and Jung Ho Ahn", title = "Row-buffer decoupling: a case for low-latency {DRAM} microarchitecture", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "337--348", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665723", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern DRAM devices for the main memory are structured to have multiple banks to satisfy ever-increasing throughput, energy-efficiency, and capacity demands. Due to tight cost constraints, only one row can be buffered (opened) per bank and actively service requests at a time, while the row must be deactivated (closed) before a new row is stored into the row buffers. Hasty deactivation unnecessarily re-opens rows for otherwise row-buffer hits while hindsight accompanies the deactivation process on the critical path of accessing data for row-buffer misses. The time to (de)activate a row is comparable to the time to read an open row while applications are often sensitive to DRAM latency. Hence, it is critical to make the right decision on when to close a row. However, the increasing number of banks per DRAM device over generations reduces the number of requests per bank. 
This forces a memory controller to frequently predict when to close a row due to a lack of information on future requests, while the dynamic nature of memory access patterns limits the prediction accuracy. In this paper, we propose a novel DRAM microarchitecture that can eliminate the need for any prediction. First, we identify that precharging the bitlines dominates the deactivate time, while sense amplifiers that work as a row buffer are physically coupled with the bitlines such that a single command precharges both bitlines and sense amplifiers simultaneously. By decoupling the bitlines from the row buffers using isolation transistors, the bitlines can be precharged right after a row becomes activated. Therefore, only the sense amplifiers need to be precharged for a miss in most cases, taking an order of magnitude shorter time than the conventional deactivation process. Second, we show that this row-buffer decoupling enables internal DRAM $\mu$-operations to be separated and recombined, which can be exploited by memory controllers to make the main memory system more energy efficient.
Our experiments demonstrate that row-buffer decoupling improves the geometric mean of the instructions per cycle and MIPS$^2$/W by 14\% and 29\%, respectively, for memory-intensive SPEC CPU2006 applications", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Zhang:2014:HDH, author = "Tao Zhang and Ke Chen and Cong Xu and Guangyu Sun and Tao Wang and Yuan Xie", title = "{Half-DRAM}: a high-bandwidth and low-power {DRAM} architecture from the rethinking of fine-grained activation", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "349--360", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665724", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "DRAM memory is a major contributor for the total power consumption in modern computing systems. Consequently, power reduction for DRAM memory is critical to improve system-level power efficiency. Fine-grained DRAM architecture [1, 2] has been proposed to reduce the activation/precharge power. However, those prior work either incurs significant performance degradation or introduces large area overhead. In this paper, we propose a novel memory architecture Half-DRAM, in which the DRAM array is reorganized to enable only half of a row being activated. The half-row activation can effectively reduce activation power and meanwhile sustain the full bandwidth one bank can provide. In addition, the half-row activation in Half-DRAM relaxes the power constraint in DRAM, and opens up opportunities for further performance gain. Furthermore, two half-row accesses can be issued in parallel by integrating the sub-array level parallelism to improve the memory level parallelism.
The experimental results show that Half-DRAM can achieve both significant performance improvement and power reduction, with negligible design overhead", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Kim:2014:FBM, author = "Yoongu Kim and Ross Daly and Jeremie Kim and Chris Fallin and Ji Hye Lee and Donghyuk Lee and Chris Wilkerson and Konrad Lai and Onur Mutlu", title = "Flipping bits in memory without accessing them: an experimental study of {DRAM} disturbance errors", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "361--372", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665726", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory isolation is a key property of a reliable and secure computing system --- an access to one memory address should not have unintended side effects on data stored in other addresses. However, as DRAM process technology scales down to smaller dimensions, it becomes more difficult to prevent DRAM cells from electrically interacting with each other. In this paper, we expose the vulnerability of commodity DRAM chips to disturbance errors. By reading from the same address in DRAM, we show that it is possible to corrupt data in nearby addresses. More specifically, activating the same row in DRAM corrupts data in nearby rows. We demonstrate this phenomenon on Intel and AMD systems using a malicious program that generates many DRAM accesses. We induce errors in most DRAM modules (110 out of 129) from three major DRAM manufacturers. From this we conclude that many deployed systems are likely to be at risk. 
We identify the root cause of disturbance errors as the repeated toggling of a DRAM row's wordline, which stresses inter-cell coupling effects that accelerate charge leakage from nearby rows. We provide an extensive characterization study of disturbance errors and their behavior using an FPGA-based testing platform. Among our key findings, we show that (i) it takes as few as 139K accesses to induce an error and (ii) up to one in every 1.7K cells is susceptible to errors. After examining various potential ways of addressing the problem, we propose a low-overhead solution to prevent the errors", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Zhang:2014:AIP, author = "Runjie Zhang and Ke Wang and Brett H. Meyer and Mircea R. Stan and Kevin Skadron", title = "Architecture implications of pads as a scarce resource", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "373--384", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665728", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Due to non-ideal technology scaling, delivering a stable supply voltage is increasingly challenging. Furthermore, competition for limited chip interface resources (i.e., C4 pads) between power supply and I/O, and the loss of such resources to electromigration, means that constructing a power delivery network (PDN) that satisfies noise margins without compromising performance is and will remain a critical problem for architects and circuit designers alike. 
Simple guardbanding will no longer work, as the consequent performance penalty will grow with technology scaling. In this paper, we develop a pre-RTL PDN model, VoltSpot, for the purpose of studying the performance and noise tradeoffs among power supply and I/O pad allocation, the effectiveness of noise mitigation techniques, and the consequent implications of electromigration-induced PDN pad failure. Our simulations demonstrate that, despite their integral role in the PDN, power/ground pads can be aggressively reduced (by conversion into I/O pads) to their electromigration limit with minimal performance impact from extra voltage noise --- provided the system implements a suitable noise-mitigation strategy. The key observation is that even though reducing power/ground pads significantly increases the number of voltage emergencies, the average noise amplitude increase is small. Overall, we can triple I/O bandwidth while maintaining target lifetimes and incurring only 1.5\% slowdown", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Chen:2014:ICB, author = "Shaoming Chen and Yue Hu and Ying Zhang and Lu Peng and Jesse Ardonne and Samuel Irving and Ashok Srivastava", title = "Increasing off-chip bandwidth in multi-core processors with switchable pins", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "385--396", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665730", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Off-chip memory bandwidth has been considered as one of the major limiting factors to processor performance, especially for multi-cores and many-cores.
Conventional processor design allocates a large portion of off-chip pins to deliver power, leaving a small number of pins for processor signal communication. We observed that the processor requires much less power than that can be supplied during memory intensive stages. This is due to the fact that the frequencies of processor cores waiting for data to be fetched from off-chip memories can be scaled down in order to save power without degrading performance. In this work, motivated by this observation, we propose a dynamic pin switch technique to alleviate the bandwidth limitation issue. The technique is introduced to dynamically exploit the surplus pins for power delivery in the memory intensive phases and uses them to provide extra bandwidth for the program executions, thus significantly boosting the performance", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Jiang:2014:LPR, author = "Lei Jiang and Bo Zhao and Jun Yang and Youtao Zhang", title = "A low power and reliable charge pump design for phase change memories", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "397--408", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665731", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The emerging Phase Change Memory (PCM) technology exhibits excellent scalability and density potentials. At the same time, they require high current and high voltages to switch cell states. Their working voltages are provided by CMOS-compatible on-chip charge pumps (CPs). 
Unfortunately, CPs and particularly those for RESET, have a large parasitic power (a dominant component in total power loss) during operations, which significantly degrades their energy efficiency. In addition, CPs seriously suffer from the Time-Dependent Dielectric Breakdown (TDDB) problem due to their boosted operation voltage. To maintain a reasonable lifetime of CPs, existing solutions actively switch them on per-operation basis, resulting in large performance degradation. In this paper, we address the above issues through two designs --- Reset\_Sch (RESET scheduling) and CP\_Sch (CP scheduling). Reset\_Sch schedules when to perform a RESET for different cells upon writing a PCM line. It significantly reduces the power loss, and peak working power of RESET CP. CP\_Sch incorporates a fast READ CP design to provide fast charge-up time for reads and minimize performance penalty. Our experimental results show that on average, 70\% of power loss for RESET CP can be reduced; and performance loss can be reduced from 16\% to 2\% while achieving a 16\% improvement in reliability", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Voskuilen:2014:FCP, author = "Gwendolyn Voskuilen and T. N. Vijaykumar", title = "{Fractal++}: closing the performance gap between fractal and conventional coherence", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "409--420", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665733", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cache coherence protocol bugs can cause multicores to fail. Existing coherence verification approaches incur state explosion at small scales or require considerable human effort.
As protocols' complexity and multicores' core counts increase, verification continues to be a challenge. Recently, researchers proposed fractal coherence which achieves scalable verification by enforcing observational equivalence between sub-systems in the coherence protocol. A larger subsystem is verified implicitly if a smaller sub-system has been verified. Unfortunately, fractal protocols suffer from two fundamental limitations: (1) indirect-communication: sub-systems cannot directly communicate and (2) partially-serial invalidations: cores must be invalidated in a specific, serial order. These limitations disallow common performance optimizations used by conventional directory protocols: reply forwarding where caches communicate directly and parallel invalidations. Therefore, fractal protocols lack performance scalability while directory protocols lack verification scalability. To enable both performance and verification scalability, we propose Fractal++ which employs a new class of protocol optimizations for verification-constrained architectures: decoupled-replies, contention-hints, and fully-parallel-fractal-invalidations. The first two optimizations allow reply-forwarding-like performance while the third optimization enables parallel invalidations in fractal protocols. Unlike conventional protocols, Fractal++ preserves observational equivalence and hence is scalably verifiable. 
In 32-core simulations of single- and four-socket systems, Fractal++ performs nearly as well as a directory protocol while providing scalable verifiability whereas the best-performing previous fractal protocol performs 8\% on average and up to 26\% worse with a single-socket and 12\% on average and up to 34\% worse with a longer-latency multi-socket system", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Qian:2014:ODB, author = "Xuehai Qian and Benjamin Sahelices and Josep Torrellas", title = "{OmniOrder}: directory-based conflict serialization of transactions", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "421--432", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665734", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Effective execution of atomic blocks of instructions (also called transactions) can enhance the performance and programmability of multiprocessors. Atomic blocks can be demarcated in software as in Transactional Memory (TM) or dynamically generated by the hardware as in aggressive implementations of strict memory consistency. In most current designs, when two atomic blocks conflict, one is squashed --- a performance loss that is often unnecessary. To avoid this waste, this paper presents OmniOrder, the first design that efficiently executes conflicting atomic blocks concurrently in a directory-based coherence environment. The idea is to keep only non-speculative data in the caches and, when the cache coherence protocol transfers a line, include in the message the history of speculative updates to the line. The coherence protocol transitions are unmodified. We evaluate OmniOrder with 64-core simulations. 
In a TM environment, OmniOrder reduces the execution time of the STAMP applications by an average of 18.4\% over a scheme that squashes on conflict. In an environment with SC enforcement with speculation, we run 11 programs that implement concurrent algorithms. OmniOrder reduces the programs' execution time by an average of 15.3\% relative to a scheme that squashes on conflict. Finally, OmniOrder's communication overhead of transferring the history of speculative updates is negligible", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Qian:2014:PRR, author = "Xuehai Qian and Benjamin Sahelices and Depei Qian", title = "{Pacifier}: record and replay for relaxed-consistency multiprocessors with distributed directory protocol", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "433--444", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665736", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Record and Deterministic Replay (R\&R) of multithreaded programs on relaxed-consistency multiprocessors with distributed directory protocol has been a long-standing open problem. The independently developed RelaxReplay [8] solves the problem by assuming write atomicity. This paper proposes Pacifier, the first R\&R scheme to provide a solution without assuming write atomicity. R\&R for relaxed-consistency multiprocessors needs to detect, record and replay Sequential Consistency Violations (SCV). 
Pacifier has two key components: (i) Relog, a general memory reordering logging and replay mechanism that can reproduce SCVs in relaxed memory models, and (ii) Granule, an SCV detection scheme in the record phase with good precision, that indicates whether to record with Relog. We show that Pacifier is a sweet spot in the design space with a reasonable trade-off between hardware and log overhead. An evaluation with simulations of 16, 32 and 64 processors with Release Consistency (RC) running SPLASH-2 applications indicates that Pacifier incurs 3.9\% $\sim$ 16\% larger logs. The slowdown of Pacifier during replay is 10.1\% $\sim$ 30.5\% compared to native execution", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Honarmand:2014:RDL, author = "Nima Honarmand and Josep Torrellas", title = "Replay debugging: leveraging record and replay for program debugging", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "445--456", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665737", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware-assisted Record and Deterministic Replay (RnR) of programs has been proposed as a primitive for debugging hard-to-repeat software bugs. However, simply providing support for repeatedly stumbling on the same bug does not help diagnose it. For bug diagnosis, developers typically want to modify the code, e.g., by creating and operating on new variables, or printing state. Unfortunately, this renders the RnR log inconsistent and makes Replay Debugging (i.e., debugging while using an RnR log for replay) dicey at best. This paper presents rdb, the first scheme for replay debugging that guarantees exact replay.
rdb relies on two mechanisms. The first one is compiler support to split the instrumented application into two executables: one that is identical to the original program binary, and another that encapsulates all the added debug code. The second mechanism is a runtime infrastructure that replays the application and, without affecting it in any way, invokes the appropriate debug code at the appropriate locations. We describe an implementation of rdb based on LLVM and Pin, and show an example of how rdb's replay debugging helps diagnose a real bug", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Woodruff:2014:CCM, author = "Jonathan Woodruff and Robert N. M. Watson and David Chisnall and Simon W. Moore and Jonathan Anderson and Brooks Davis and Ben Laurie and Peter G. Neumann and Robert Norton and Michael Roe", title = "The {CHERI} capability model: revisiting {RISC} in an age of risk", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "457--468", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665740", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Motivated by contemporary security challenges, we reevaluate and refine capability-based addressing for the RISC era. We present CHERI, a hybrid capability model that extends the 64-bit MIPS ISA with byte-granularity memory protection. We demonstrate that CHERI enables language memory model enforcement and fault isolation in hardware rather than software, and that the CHERI mechanisms are easily adopted by existing programs for efficient in-program memory safety. 
In contrast to past capability models, CHERI complements, rather than replaces, the ubiquitous page-based protection mechanism, providing a migration path towards deconflating data-structure protection and OS memory management. Furthermore, CHERI adheres to a strict RISC philosophy: it maintains a load-store architecture and requires only single-cycle instructions, and supplies protection primitives to the compiler, language runtime, and operating system. We demonstrate a mature FPGA implementation that runs the FreeBSD operating system with a full range of software and an open-source application suite compiled with an extended LLVM to use CHERI memory protection. A limit study compares published memory safety mechanisms in terms of instruction count and memory overheads. The study illustrates that CHERI is performance-competitive even while providing assurance and greater flexibility with simpler hardware", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Vilanova:2014:CPS, author = "Llu{\"\i}s Vilanova and Muli Ben-Yehuda and Nacho Navarro and Yoav Etsion and Mateo Valero", title = "{CODOMs}: protecting software with code-centric memory domains", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "469--480", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665741", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Today's complex software systems are neither secure nor reliable. 
The rudimentary software protection primitives provided by current hardware forces systems to run many distrusting software components (e.g., procedures, libraries, plugins, modules) in the same protection domain, or otherwise suffer degraded performance from address space switches. We present CODOMs (COde-centric memory DOMains), a novel architecture that can provide finer-grained isolation between software components with effectively zero run-time overhead, all at a fraction of the complexity of other approaches. An implementation of CODOMs in a cycle-accurate full-system x86 simulator demonstrates that with the right hardware support, finer-grained protection and run-time performance can peacefully coexist.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Perais:2014:EPW, author = "Arthur Perais and Andr{\'e} Seznec", title = "{EOLE}: paving the way for an effective implementation of value prediction", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "481--492", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665742", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Even in the multicore era, there is a continuous demand to increase the performance of single-threaded applications. However, the conventional path of increasing both issue width and instruction window size inevitably leads to the power wall. Value prediction (VP) was proposed in the mid 90's as an alternative path to further enhance the performance of wide-issue superscalar processors. 
Still, it was considered up to recently that a performance-effective implementation of Value Prediction would add tremendous complexity and power consumption in almost every stage of the pipeline. Nonetheless, recent work in the field of VP has shown that given an efficient confidence estimation mechanism, prediction validation could be removed from the out-of-order engine and delayed until commit time. As a result, recovering from mispredictions via selective replay can be avoided and a much simpler mechanism --- pipeline squashing --- can be used, while the out-of-order engine remains mostly unmodified. Yet, VP and validation at commit time entails strong constraints on the Physical Register File. Write ports are needed to write predicted results and read ports are needed in order to validate them at commit time, potentially rendering the overall number of ports unbearable. Fortunately, VP also implies that many single-cycle ALU instructions have their operands predicted in the front-end and can be executed in-place, in-order. Similarly, the execution of single-cycle instructions whose result has been predicted can be delayed until commit time since predictions are validated at commit time. Consequently, a significant number of instructions --- 10\% to 60\% in our experiments --- can bypass the out-of-order engine, allowing the reduction of the issue width, which is a major contributor to both out-of-order engine complexity and register file port requirement. This reduction paves the way for a truly practical implementation of Value Prediction.
Furthermore, since Value Prediction in itself usually increases performance, our resulting \{Early | Out-of-Order | Late\} Execution architecture, EOLE, is often more efficient than a baseline VP-augmented 6-issue superscalar while having a significantly narrower 4-issue out-of-order engine", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Czechowski:2014:IEE, author = "Kenneth Czechowski and Victor W. Lee and Ed Grochowski and Ronny Ronen and Ronak Singhal and Richard Vuduc and Pradeep Dubey", title = "Improving the energy efficiency of big cores", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "493--504", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665743", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Traditionally, architectural innovations designed to boost single-threaded performance incur overhead costs which significantly increase power consumption. In many cases the increase in power exceeds the improvement in performance, resulting in a net increase in energy consumption. Thus, it is reasonable to assume that modern attempts to improve single-threaded performance will have a negative impact on energy efficiency. This has led to the belief that ``Big Cores'' are inherently inefficient. To the contrary, we present a study which finds that the increased complexity of the core microarchitecture in recent generations of the Intel\textregistered{} Core\texttrademark{} processor have reduced both the time and energy required to run various workloads.
Moreover, taking out the impact of process technology changes, our study still finds the architecture and microarchitecture changes --- such as the increase in SIMD width, addition of the frontend caches, and the enhancement to the out-of-order execution engine --- account for 1.2x improvement in energy efficiency for these processors. This paper provides real-world examples of how architectural innovations can mitigate inefficiencies associated with ``Big Cores'' --- for example, micro-op caches obviate the costly decode of complex x86 instructions --- resulting in a core architecture that is both high performance and energy efficient. It also contributes to the understanding of how microarchitecture affects performance, power and energy efficiency by modeling the relationship between them", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{StAmant:2014:GPC, author = "Ren{\'e}e {St. Amant} and Amir Yazdanbakhsh and Jongse Park and Bradley Thwaites and Hadi Esmaeilzadeh and Arjang Hassibi and Luis Ceze and Doug Burger", title = "General-purpose code acceleration with limited-precision analog computation", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "505--516", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665746", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As improvements in per-transistor speed and energy efficiency diminish, radical departures from conventional approaches are becoming critical to improving the performance and energy efficiency of general-purpose processors. 
We propose a solution from circuit to compiler --- that enables general-purpose use of limited-precision, analog hardware to accelerate ``approximable'' code --- code that can tolerate imprecise execution. We utilize an algorithmic transformation that automatically converts approximable regions of code from a von Neumann model to an ``analog'' neural model. We outline the challenges of taking an analog approach, including restricted-range value encoding, limited precision in computation, circuit inaccuracies, noise, and constraints on supported topologies. We address these limitations with a combination of circuit techniques, a hardware/software interface, neural network training techniques, and compiler support. Analog neural acceleration provides whole application speedup of 3.7x and energy savings of 6.3x with quality loss less than 10\% for all except one benchmark. These results show that using limited-precision analog circuits for code acceleration, through a neural approach, is both feasible and beneficial over a range of approximation-tolerant, emerging applications including financial analysis, signal processing, robotics, 3D gaming, compression, and image processing", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Madhavan:2014:RLH, author = "Advait Madhavan and Timothy Sherwood and Dmitri Strukov", title = "Race logic: a hardware acceleration for dynamic programming algorithms", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "517--528", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665747", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose a novel computing approach, dubbed ``Race Logic'', in which information, 
instead of being represented as logic levels, as is done in conventional logic, is represented as a timing delay. Under this new information representation, computations can be performed by observing the relative propagation times of signals injected into the circuit (i.e. the outcome of races). Race Logic is especially suited for solving problems related to the", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Arnau:2014:ERF, author = "Jose-Maria Arnau and Joan-Manuel Parcerisa and Polychronis Xekalakis", title = "Eliminating redundant fragment shader executions on a mobile {GPU} via hardware memoization", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "529--540", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665748", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Redundancy is at the heart of graphical applications. In fact, generating an animation typically involves the succession of extremely similar images. In terms of rendering these images, this behavior translates into the creation of many fragment programs with the exact same input data. We have measured this fragment redundancy for a set of commercial Android applications, and found that more than 40\% of the fragments used in a frame have been already computed in a prior frame. In this paper we try to exploit this redundancy, using fragment memoization. Unfortunately, this is not an easy task as most of the redundancy exists across frames, rendering most HW based schemes unfeasible. We thus first take a step back and try to analyze the temporal locality of the redundant fragments, their complexity, and the number of inputs typically seen in fragment programs. 
The result of our analysis is a task level memoization scheme, that easily outperforms the current state-of-the-art in low power GPUs. More specifically, our experimental results show that our scheme is able to remove 59.7\% of the redundant fragment computations on average. This materializes to a significant speedup of 17.6\% on average, while also improving the overall energy efficiency by 8.9\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Zhu:2014:WAS, author = "Yuhao Zhu and Vijay Janapa Reddi", title = "{WebCore}: architectural support for mobile {Web} browsing", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "541--552", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665749", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The Web browser is undoubtedly the single most important application in the mobile ecosystem. An average user spends 72 minutes each day using the mobile Web browser. Web browser internal engines (e.g., WebKit) are also growing in importance because they provide a common substrate for developing various mobile Web applications. In a user-driven, interactive, and latency-sensitive environment, the browser's performance is crucial. However, the battery-constrained nature of mobile devices limits the performance that we can deliver for mobile Web browsing.
As traditional general-purpose techniques to improve performance and energy efficiency fall short, we must employ domain-specific knowledge while still maintaining general-purpose flexibility. In this paper, we first perform design-space exploration to identify appropriate general-purpose architectures that uniquely fit the characteristics of a popular Web browsing engine. Despite our best effort, we discover sources of energy inefficiency in these customized general-purpose architectures. To mitigate these inefficiencies, we propose, synthesize, and evaluate two new domain-specific specializations, called the Style Resolution Unit and the Browser Engine Cache. Our optimizations boost energy efficiency and at the same time improve mobile Web browsing performance. As emerging mobile workloads increasingly rely more on Web browser technologies, the type of optimizations we propose will become important in the future and are likely to have lasting widespread impact", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '14 conference proceedings.", } @Article{Kodama:2014:PFB, author = "Yuetsu Kodama and Toshihiro Hanawa and Taisuke Boku and Mitsuhisa Sato", title = "{PEACH2}: an {FPGA}-based {PCIe} network device for Tightly Coupled Accelerators", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "3--8", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693716", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In recent years, heterogeneous clusters using accelerators are often used for high performance computing systems.
In such clusters, inter-node communication between accelerators requires several memory copies via CPU memory, and the communication latency incurred severely reduces performance. To solve this problem, we have been proposing a Tightly Coupled Accelerators (TCA) architecture intended to reduce the communication latency between accelerators over different nodes. In the TCA architecture, PCI Express packets are used for communication among GPUs over nodes. We developed a communication chip that we call the named PEACH2 chip, to help implement the TCA architecture. In this paper, we describe the details of the design and implementation of the PEACH2 chip, with respect to its routing mechanism and its DMA controller using FPGA. We evaluated the PEACH2 on a new platform that uses the latest Xeon CPU, IvyBridge, and achieved 2.3 GBytes/sec between GPUs over nodes, while the performance was only 880 MBytes/sec on the previous platform with SandyBridge.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Nomura:2014:PAM, author = "Shimpei Nomura and Takuji Mitsuishi and Jun Suzuki and Yuki Hayashi and Masaki Kan and Hideharu Amano", title = "Performance Analysis of the {Multi-GPU} System with {ExpEther}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "9--14", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693717", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "A GPU cluster in which each node provides a few GPUs connected with PCIe (PCI Express) is commonly used for acceleration of a large application program requiring the performance beyond a single GPU. 
However, in such a system, programmers are required to describe two parallel programming between nodes in MPIs or other message passing library as well as the fine grained parallel programming for intra-GPUs. As a cost effective alternative of such clusters, we propose a novel multi-GPU system with ExpEther, a virtualization technique which extends PCIe of a host CPU to Ethernet. All devices connected by ExpEther can be treated as if they were directly connected to the host. Evaluation with two application programs with and without GPU-GPU communication revealed that the proposed system with four GPUs achieved 3.88 and 3.29 times performance improvement respectively compared with a single GPU system. Compared with GPU cluster system in which each node provides a GPU, the proposed system achieved about 7\% and 30\% performance improvement, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Watanabe:2014:GAH, author = "Tsuyoshi Watanabe and Naohito Nakasato", title = "{GPU} Accelerated Hybrid Tree Algorithm for Collision Less {$N$}-body Simulations", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "15--20", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693718", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose a hybrid tree algorithm for reducing calculation and communication cost of collision-less N-body simulations. The concept of our algorithm is that we split interaction force into two parts: hard-force from neighbor particles and soft-force from distant particles, and applying different time integration for the forces. 
For hard-force calculation, we can efficiently reduce the calculation and communication cost of the parallel tree code because we only need data of neighbor particles for this part. We implement the algorithm on GPU clusters to accelerate force calculation for both hard and soft force. As the result of implementing the algorithm on GPU clusters, we were able to reduce the communication cost and the total execution time to 40\% and 80\% of that of a normal tree algorithm, respectively. In addition, the reduction factor relative the normal tree algorithm is smaller for large number of processes, and we expect that the execution time can be ultimately reduced down to about 70\% of the normal tree algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Tsuyama:2014:GFA, author = "Haruhisa Tsuyama and Tsutomu Maruyama", title = "{GPU} and {FPGA} Acceleration of Level Set Method", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "21--25", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693719", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The level set method is one of the most powerful image segmentation methods. Its computational complexity, however, is very high, and many approaches to reduce the computation time have been proposed. In this paper, we describe a new level set algorithm for parallel processing, and its implementation on GPU and FPGA. The computational complexity of this algorithm is higher than previous algorithms, but it is possible to achieve higher performance by parallel processing. 
We implemented the algorithm on GeForce GTX780Ti, and Xilinx XC7VX485T, and compared their performances.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Tanabe:2014:FAO, author = "Yu Tanabe and Tsutomu Maruyama", title = "Fast and Accurate Optical Flow Estimation using {FPGA}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "27--32", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693720", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we extend an approach used in the stereo vision for the optical flow estimation to achieve lower error rates. In the optical flow estimation, two dimensional search is required, and more hardware resources becomes necessary than the stereo vision that requires only one dimensional search. In our implementation, the target image is divided into sub-images, and they are processed in turn to reduce the required circuit size. 
The error rates by our system is much lower than previous works, and its processing speed is fast enough for practical applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Torres-Huitzil:2014:AEI, author = "Cesar Torres-Huitzil and Marco Aurelio Nu{\~n}o-Maganda", title = "Area-time Efficient Implementation of Local Adaptive Image Thresholding in Reconfigurable Hardware", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "33--38", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693721", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Local adaptive thresholding plays an important role in image binarization since it is used to effectively distinguish objects of interest from background regions. This step affects the performance of further processing stages in embedded computer vision applications. In local thresholding, a threshold is defined for each pixel as a function of all pixels within a rectangular neighborhood, and as a consequence, this yields a high computational cost requiring significant processing time when thresholding high resolution images or large data sets. This paper presents an area-time efficient hardware implementation of a local adaptive thresholding technique based on the Bernsen algorithm targeted to a field programmable gate array (FPGA) device. Experimental results show that the proposed implementation is resource efficient and able to process a 1024x1024 gray level image in less than 10 milliseconds independent of the neighborhood size. 
The architecture demonstrates over 100-fold speedup compared to a straightforward software implementation of the original Bernsen algorithm on a desktop computer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Gohringer:2014:RMS, author = "Diana G{\"o}hringer", title = "Reconfigurable Multiprocessor Systems: Handling {Hydras} Heads --- A Survey", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "39--44", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693722", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Novel solutions are needed to fulfill the increasing demands of embedded systems, i.e. lowering the energy consumption, increasing the performance, reducing the development time and keeping the costs as low as possible. In addition, there exist several applications, which require runtime adaptations of the algorithms based on the connection to its environment. These challenges can be solved by using reconfigurable Multiprocessor Systems-on-Chip (MPSoCs), which can adapt the hardware as well as the software to the application requirements and therefore achieve a high computational efficiency as well as a high flexibility. However, the development, the programming and the operation of such flexible and heterogeneous systems is very complex as the many criteria (Performance, power consumption, costs, development time, runtime adaptations, etc.) open a huge design space. In this paper an overview of the challenges faced when developing runtime adaptive MPSoCs is given. 
Finally, for each challenge a survey of possible solutions are presented.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Sano:2014:FBC, author = "Kentaro Sano and Ryotaro Chiba and Tomoya Ueno and Hayato Suzuki and Ryo Ito and Satoru Yamamoto", title = "{FPGA}-based Custom Computing Architecture for Large-Scale Fluid Simulation with Building Cube Method", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "45--50", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693723", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We are designing a custom computing machine for large-scale fluid simulation with the building-cube method (BCM). In BCM, parallel computation is performed with cubes, each of which is an orthogonal grid with a fixed resolution of cells. Although BCM is advantageous in balancing loads with cubes, it also has a problem of efficiency and scalability for computing with general-purpose supercomputers due to insufficient memory bandwidth and communication overhead of an interconnection network. In this paper, we present a custom computing architecture for FPGA-based scalable BCM computation with a dedicated network, called an accelerator domain network (ADN). We design a cube engine which allows bandwidth-efficient computation of cubes based on streamed stencil computation of the fractional-step method. Through prototype implementation, we evaluate the potential performance of the architecture. 
For ALTERA Stratix V 28nm FPGA, we estimate that a single FPGA has the peak performance of 107 GFlop/s in a single precision.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Wang:2014:GRS, author = "Tao Wang and Guangyu Sun and Jiahua Chen and Jian Gong and Haoyang Wu and Xiaoguang Li and Songwu Lu and Jason Cong", title = "{GRT}: a Reconfigurable {SDR} Platform with High Performance and Usability", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "51--56", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693724", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The importance of software-defined radio (SDR) continues to increase. However, existing SDR platforms become less efficient as the wireless industry moves towards Gigabit WiFi. In this work, we propose a novel reconfigurable SDR platform named GRT. With the help of reconfigurable architecture and corresponding software support, SDR designs on GRT can leverage high performance of the underlying hardware and provide sufficient usability, including the support for efficient modular design, commodity interface, good programmability, code reusability, etc. We implement an 802.11a/g WiFi system on GRT to evaluate its performance. 
The results demonstrate that GRT can achieve a substantial improvement in usability while still satisfying the performance requirement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Ando:2014:CSF, author = "Yuki Ando and Masataka Ogawa and Yuya Mizoguchi and Kouta Kumagai and Miaw Torng-Der and Shinya Honda", title = "A Case Study of {FPGA Blokus Duo} Solver by System-Level Design", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "57--62", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693725", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a case study to design a Blokus Duo solver by using our system-level design toolkit named SystemBuilder. We start with a modeling of the Blokus Duo solver by C language and communication APIs which are provided by SystemBuilder. Then, we iteratively verified and tuned the parameters in the solver by running the model on a general computer in order to improve the performance of the solver. Finally, the implementation on FPGA was automatically generated from the model by SystemBuilder. Despite the FPGA implementation, we have never written hardware description language throughout the case study. 
The case study demonstrates the easiness to design system on FPGA by System-level design tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Joldes:2014:SSH, author = "Mioara Joldes and Valentina Popescu and Warwick Tucker", title = "Searching for Sinks for the {H{\'e}non} Map using a Multiple-precision {GPU} Arithmetic Library", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "63--68", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693726", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Today, GPUs represent an important hardware development platform for many problems in dynamical systems, where massive parallel computations are needed. Beside that, many numerical studies of chaotic dynamical systems require a computing precision higher than common floating point (FP) formats. One such application is locating invariant sets for chaotic dynamical systems. In particular, we focus on rigorously proving the existence of stable periodic orbits for the H{\'e}non map for parameter values close to the classical ones. For that, we present a multiple-precision floating-point arithmetic library in CUDA programming language for the NVIDIA GPU platform. Our library extends the precision using so-called FP expansions, where a number is represented as the unevaluated sum of standard machine precision FP numbers. This format offers the advantage of using directly available and highly optimized hardware FP operations. 
We generalize algorithms used by multiple-precisions libraries such as Bailey's QD, or the analogue GPU version, GQD.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Soejima:2014:MPF, author = "Rie Soejima and Koji Okina and Keisuke Dohi and Yuichiro Shibata and Kiyoshi Oguri", title = "A Memory Profiling Framework for Stencil Computation on an {FPGA} Accelerator with High Level Synthesis", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "69--74", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693727", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we propose a framework to assist memory access optimization for stencil computation on an FPGA accelerator. Since the stencil computations such as scientific simulations need large amounts of data, efficient memory access is a key to achieving high performance on FPGA accelerators. Therefore, we implemented a stencil computation framework with a memory performance profiler on MaxCompiler, which is one of high level synthesis systems. The memory profiler enables us to measure clock cycles for various memory controller states; data transfer, stall, and idle. We also implemented simple stencil computations and practical FDTD electromagnetic field simulations on top of the framework with various parameters to evaluate and analyze memory performance. As a result of execution experiments of the simple stencil computations on a MAX34245A Data Flow Engine, it was demonstrated that approximately 70\% of the peak memory performance could be achieved for various stencil types. 
On the other hand, the FDTD simulations, which need many data streams, could not hit this memory performance saturation point, because of increasing complexity of memory controller modules. Through the analysis of evaluation results obtained by our memory performance profiling framework, a promising memory access optimization approach for stencil computations in which the complexity of the memory controller is traded off against data access traffic is suggested.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Morishima:2014:PEG, author = "Shin Morishima and Hiroki Matsutani", title = "Performance Evaluations of Graph Database using {CUDA} and {OpenMP} Compatible Libraries", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "75--80", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693728", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Graph databases use graph structures to store data sets as nodes, edges, and properties. They are used to store and search the relationships between a large number of nodes, such as social networking services and recommendation engines that use customer social graphs. Since computation cost for graph search queries increases as the graph becomes large, in this paper we accelerate the graph search functions (Dijkstra and A* algorithms) of a graph database Neo4j using two ways: multi-threaded library and CUDA library for graphics processing units (GPUs). We use 100,000-node graphs generated based on a degree distribution of Facebook social graph for evaluations. 
Our multi-threaded and GPU-based implementations require an auxiliary adjacency matrix for a target graph. The results show that, when we do not take into account additional overhead to generate the auxiliary adjacency matrix, multi-threaded version improves the Dijkstra and A* search performance by 16.2x and 13.8x compared to the original implementation. The GPU-based implementation improves the Dijkstra and A* search performance by 26.2x and 32.8x. When we take into account the overhead, although the speed-ups by our implementations are reduced, by reusing the auxiliary adjacency matrix for multiple graph search queries we can significantly improve the graph search performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Mitsuishi:2014:ABF, author = "Takuji Mitsuishi and Shimpei Nomura and Jun Suzuki and Yuki Hayashi and Masaki Kan and Hideharu Amano", title = "Accelerating Breadth First Search on {GPU--BOX}", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "81--86", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693729", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "The graph analysis has been applied in various fields related to big-data processing and actively researched in recent years. For processing a larger scale of graph, parallel computing with multi-GPU system is paid attention as an economical solution. Here, an efficient parallel method is proposed to solve a typical graph analysis, Breadth First Search (BFS) for multi-GPU systems. 
Our target system is GPU-BOX, a prototype of multi-GPU system using ExpEther which is a virtualization technology based on PCI Express and Ethernet. Although many vertices between GPUs must be exchanged to run BFS on multi-GPU system, GPU-BOX provides only small communication performance because of using Ethernet. Our parallel algorithm for BFS is designed so as to reduce the traffic between GPUs as possible. The proposed method reduced 30-40\% traffic between GPUs and improved the traditional parallel method by 10\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Nunez-Yanez:2014:EER, author = "Jose Nunez-Yanez", title = "Energy efficient Reconfigurable Computing with Adaptive Voltage and Logic scaling", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "87--92", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693730", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper investigates a novel energy-proportional concept that combines closed-loop voltage scalability and run-time hardware reconfiguration. Voltage scaling is based on in-situ detectors that allow the device to detect valid working voltage and frequency pairs at run-time. The combined approach named AVLS (Adaptive Voltage and Logic Scaling) enables the adaptation of capacitance, voltage and frequency to obtain power and energy savings based on workload, process and operating conditions in a closed-loop configuration. The technique is applied to a reconfigurable motion estimation processor that can be configured with a variable number of execution units and it is used as a test vehicle. 
The results demonstrate that the proposed voltage scaling can obtain up to 85\% reduction in energy compared with nominal voltage operation at the same frequency. This efficient energy point is obtained at a voltage of 0.62 V and frequency of 56 MHz compared with running the core at the same frequency and nominal 1 V. The addition of logic scalability means that if enough device resources are available a parallel configuration with six execution units operating at 0.62 V reduces energy by up to 95\% compared with a single execution unit operating at 1 V and the same frequency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Thorson:2014:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "93--101", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693732", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:35 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Thorson:2014:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "93--101", month = sep, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693732", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '14 conference proceedings.", } @Article{Ozturk:2015:ASC, 
author = "Ozcan Ozturk", title = "Architectural Support for Cyber-Physical Systems", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "1--1", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694375", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cyber-physical systems are integrations of computation, communication networks, and physical dynamics. Although time plays a central role in the physical world, all widely used software abstractions lack temporal semantics. The notion of correct execution of a program written in every widely-used programming language today does not depend on the temporal behavior of the program. But temporal behavior matters in almost all systems, and most particularly in cyber-physical systems. In this talk, I will argue that time can and must become part of the semantics of programs for a large class of applications. To illustrate that this is both practical and useful, we will describe a recent effort at Berkeley in the design and implementation of timing-centric software systems. Specifically, I will describe PRET machines, which redefine the instruction-set architecture (ISA) of a microprocessor to embrace temporal semantics. 
Such machines can be used in high-confidence and safety-critical systems, in energy-constrained systems, in mixed-criticality systems, and as a Real-Time Unit (RTU) that cooperates with a general-purpose processor to provide real-time services, in a manner similar to how a GPU provides graphics services.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Zhang:2015:MRH, author = "Yiying Zhang and Jian Yang and Amirsaman Memaripour and Steven Swanson", title = "{Mojim}: a Reliable and Highly-Available Non-Volatile Memory System", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "3--18", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694370", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Next-generation non-volatile memories (NVMs) promise DRAM-like performance, persistence, and high density. They can attach directly to processors to form non-volatile main memory (NVMM) and offer the opportunity to build very low-latency storage systems. These high-performance storage systems would be especially useful in large-scale data center environments where reliability and availability are critical. However, providing reliability and availability to NVMM is challenging, since the latency of data replication can overwhelm the low latency that NVMM should provide. We propose Mojim, a system that provides the reliability and availability that large-scale storage systems require, while preserving the performance of NVMM. Mojim achieves these goals by using a two-tier architecture in which the primary tier contains a mirrored pair of nodes and the secondary tier contains one or more secondary backup nodes with weakly consistent copies of data. 
Mojim uses highly-optimized replication protocols, software, and networking stacks to minimize replication costs and expose as much of NVMM's performance as possible. We evaluate Mojim using raw DRAM as a proxy for NVMM and using an industrial NVMM emulation system. We find that Mojim provides replicated NVMM with similar or even better performance than un-replicated NVMM (reducing latency by 27\% to 63\% and delivering between 0.4 to 2.7X the throughput). We demonstrate that replacing MongoDB's built-in replication system with Mojim improves MongoDB's performance by 3.4 to 4X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Wang:2015:SPC, author = "Rujia Wang and Lei Jiang and Youtao Zhang and Jun Yang", title = "{SD--PCM}: Constructing Reliable Super Dense Phase Change Memory under Write Disturbance", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "19--31", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694352", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Phase Change Memory (PCM) has better scalability and smaller cell size comparing to DRAM. However, further scaling PCM cell in deep sub-micron regime results in significant thermal based write disturbance (WD). Naively allocating large inter-cell space increases cell size from 4F$^2$ ideal to 12F$^2$. While a recent work mitigates WD along word-lines through disturbance resilient data encoding, it is ineffective for WD along bit-lines, which is more severe due to widely adopted $ \mu $Trench structure in constructing PCM cell arrays. Without mitigating WD along bit-lines, a PCM cell still has 8F$^2$, which is 100\% larger than the ideal. 
In this paper, we propose SD-PCM for achieving reliable write operations in super dense PCM. In particular, we focus on mitigating WD along bit-lines such that we can construct super dense PCM chips with 4F$^2$ cell size, i.e., the minimal for diode-switch based PCM. Based on simple verification-n-correction (VnC), we propose LazyCorrection and PreRead to effectively reduce VnC overhead and minimize cascading verification during write. We further propose (n:m)-Alloc for achieving good tradeoff between VnC overhead minimization and memory capacity loss. Our experimental results show that, comparing to a WD-free low density PCM, SD-PCM achieves 80\% capacity improvement in cell arrays while incurring around 0-10\% performance degradation when using different (n:m) allocators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Young:2015:DWE, author = "Vinson Young and Prashant J. Nair and Moinuddin K. Qureshi", title = "{DEUCE}: Write-Efficient Encryption for Non-Volatile Memories", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "33--44", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694387", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Phase Change Memory (PCM) is an emerging Non Volatile Memory (NVM) technology that has the potential to provide scalable high-density memory systems. While the non-volatility of PCM is a desirable property in order to save leakage power, it also has the undesirable effect of making PCM main memories susceptible to newer modes of security vulnerabilities, for example, accessibility to sensitive data if a PCM DIMM gets stolen. PCM memories can be made secure by encrypting the data. 
Unfortunately, such encryption comes with a significant overhead in terms of bits written to PCM memory, causing half of the bits in the line to change on every write, even if the actual number of bits being written to memory is small. Our studies show that a typical writeback modifies, on average, only 12\% of the bits in the cacheline. Thus, encryption causes almost a 4x increase in the number of bits written to PCM memories. Such extraneous bit writes cause significant increase in write power, reduction in write endurance, and reduction in write bandwidth. To provide the benefit of secure memory in a write efficient manner this paper proposes Dual Counter Encryption (DEUCE). DEUCE is based on the observation that a typical writeback only changes a few words, so DEUCE reencrypts only the words that have changed. We show that DEUCE reduces the number of modified bits per writeback for a secure memory from 50\% to 24\%, which improves performance by 27\% and increases lifetime by 2x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Morrison:2015:TBT, author = "Adam Morrison and Yehuda Afek", title = "Temporally Bounding {TSO} for Fence-Free Asymmetric Synchronization", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "45--58", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694374", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper introduces a temporally bounded total store ordering (TBTSO) memory model, and shows that it enables nonblocking fence-free solutions to asymmetric synchronization problems, such as those arising in memory reclamation and biased locking. 
TBTSO strengthens the TSO memory model by bounding the time it takes a store to drain from the store buffer into memory. This bound enables devising fence-free algorithms for asymmetric problems, which require a performance-critical fast path to synchronize with an infrequently executed slow path. We demonstrate this by constructing (1) a fence-free version of the hazard pointers memory reclamation scheme, and (2) a fence-free biased lock algorithm which is compatible with unmanaged environments as it does not rely on safe points or similar mechanisms. We further argue that TBTSO can be implemented in hardware with modest modifications to existing TSO architectures. However, our design makes assumptions about proprietary implementation details of commercial hardware; it thus best serves as a starting point for a discussion on the feasibility of hardware TBTSO implementation. We also show how minimal OS support enables the adaptation of TBTSO algorithms to x86 systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Matveev:2015:RHN, author = "Alexander Matveev and Nir Shavit", title = "Reduced Hardware {NOrec}: a Safe and Scalable Hybrid Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "59--71", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694393", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Because of hardware TM limitations, software fallbacks are the only way to make TM algorithms guarantee progress. Nevertheless, all known software fallbacks to date, from simple locks to sophisticated versions of the NOrec Hybrid TM algorithm, have either limited scalability or weakened semantics. 
We propose a novel reduced-hardware (RH) version of the NOrec HyTM algorithm. Instead of an all-software slow path, in our RH NOrec the slow-path is a ``mix'' of hardware and software: one short hardware transaction executes a maximal amount of initial reads in the hardware, and the second executes all of the writes. This novel combination of the RH approach and the NOrec algorithm delivers the first Hybrid TM that scales while fully preserving the hardware's original semantics of opacity and privatization. Our GCC implementation of RH NOrec is promising in that it shows improved performance relative to all prior methods, at the concurrency levels we could test today.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Orr:2015:SUR, author = "Marc S. Orr and Shuai Che and Ayse Yilmazer and Bradford M. Beckmann and Mark D. Hill and David A. Wood", title = "Synchronization Using Remote-Scope Promotion", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "73--86", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694350", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Heterogeneous system architecture (HSA) and OpenCL define scoped synchronization to facilitate low overhead communication across a subset of threads. Scoped synchronization works well for static sharing patterns, where consumer threads are known a priori. It works poorly for dynamic sharing patterns (e.g., work stealing) where programmers cannot use a faster small scope due to the rare possibility that the work is stolen by a thread in a distant slower scope. 
This puts programmers in a conundrum: optimize the common case by synchronizing at a faster small scope or use work stealing at a slower large scope. In this paper, we propose to extend scoped synchronization with remote-scope promotion. This allows the most frequent sharers to synchronize through a small scope. Infrequent sharers synchronize by promoting that remote small scope to a larger shared scope. Synchronization using remote-scope promotion provides performance robustness for dynamic workloads, where the benefits provided by scoped synchronization and work stealing are hard to anticipate. Compared to a na{\"\i}ve baseline, static scoped synchronization alone achieves a 1.07x speedup on average and dynamic work stealing alone achieves a 1.18x speedup on average. In contrast, synchronization using remote-scope promotion achieves a robust 1.25x speedup on average, across a diverse set of graph benchmarks and inputs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Liu:2015:GHS, author = "Chang Liu and Austin Harris and Martin Maas and Michael Hicks and Mohit Tiwari and Elaine Shi", title = "{GhostRider}: a Hardware-Software System for Memory Trace Oblivious Computation", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "87--101", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694385", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a new, co-designed compiler and architecture called GhostRider for supporting privacy preserving computation in the cloud. 
GhostRider ensures all programs satisfy a property called memory-trace obliviousness (MTO): Even an adversary that observes memory, bus traffic, and access times while the program executes can learn nothing about the program's sensitive inputs and outputs. One way to achieve MTO is to employ Oblivious RAM (ORAM), allocating all code and data in a single ORAM bank, and to also disable caches or fix the rate of memory traffic. This baseline approach can be inefficient, and so GhostRider's compiler uses a program analysis to do better, allocating data to non-oblivious, encrypted RAM (ERAM) and employing a scratchpad when doing so will not compromise MTO. The compiler can also allocate to multiple ORAM banks, which sometimes significantly reduces access times. We have formalized our approach and proved it enjoys MTO. Our FPGA-based hardware prototype and simulation results show that GhostRider significantly outperforms the baseline strategy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Fletcher:2015:FON, author = "Christopher W. Fletcher and Ling Ren and Albert Kwon and Marten van Dijk and Srinivas Devadas", title = "Freecursive {ORAM}: [Nearly] Free Recursion and Integrity Verification for Position-based Oblivious {RAM}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "103--116", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694353", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Oblivious RAM (ORAM) is a cryptographic primitive that hides memory access patterns as seen by untrusted storage. Recently, ORAM has been architected into secure processors. 
A big challenge for hardware ORAM schemes is how to efficiently manage the Position Map (PosMap), a central component in modern ORAM algorithms. Implemented naively, the PosMap causes ORAM to be fundamentally unscalable in terms of on-chip area. On the other hand, a technique called Recursive ORAM fixes the area problem yet significantly increases ORAM's performance overhead. To address this challenge, we propose three new mechanisms. We propose a new ORAM structure called the PosMap Lookaside Buffer (PLB) and PosMap compression techniques to reduce the performance overhead from Recursive ORAM empirically (the latter also improves the construction asymptotically). Through simulation, we show that these techniques reduce the memory bandwidth overhead needed to support recursion by 95\%, reduce overall ORAM bandwidth by 37\% and improve overall SPEC benchmark performance by 1.27x. We then show how our PosMap compression techniques further facilitate an extremely efficient integrity verification scheme for ORAM which we call PosMap MAC (PMMAC). For a practical parameterization, PMMAC reduces the amount of hashing needed for integrity checking by $ \geq 68 \times $ relative to prior schemes and introduces only 7\% performance overhead. We prototype our mechanisms in hardware and report area and clock frequency for a complete ORAM design post-synthesis and post-layout using an ASIC flow in a 32~nm commercial process. With 2 DRAM channels, the design post-layout runs at 1~GHz and has a total area of .47~mm$^2$. Depending on PLB-specific parameters, the PLB accounts for 10\% to 26\% area. PMMAC costs 12\% of total design area. 
Our work is the first to prototype Recursive ORAM or ORAM with any integrity scheme in hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Chisnall:2015:BPA, author = "David Chisnall and Colin Rothwell and Robert N. M. Watson and Jonathan Woodruff and Munraj Vadera and Simon W. Moore and Michael Roe and Brooks Davis and Peter G. Neumann", title = "Beyond the {PDP-11}: Architectural Support for a Memory-Safe {C} Abstract Machine", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "117--130", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694367", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose a new memory-safe interpretation of the C abstract machine that provides stronger protection to benefit security and debugging. Despite ambiguities in the specification intended to provide implementation flexibility, contemporary implementations of C have converged on a memory model similar to the PDP-11, the original target for C. This model lacks support for memory safety despite well-documented impacts on security and reliability. Attempts to change this model are often hampered by assumptions embedded in a large body of existing C code, dating back to the memory model exposed by the original C compiler for the PDP-11. Our experience with attempting to implement a memory-safe variant of C on the CHERI experimental microprocessor led us to identify a number of problematic idioms. We describe these as well as their interaction with existing memory safety schemes and the assumptions that they make beyond the requirements of the C specification. 
Finally, we refine the CHERI ISA and abstract model for C, by combining elements of the CHERI capability model and fat pointers, and present a softcore CPU that implements a C abstract machine that can run legacy C code with strong memory protection guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Ma:2015:SDS, author = "Jiuyue Ma and Xiufeng Sui and Ninghui Sun and Yupeng Li and Zihao Yu and Bowen Huang and Tianni Xu and Zhicheng Yao and Yun Chen and Haibin Wang and Lixin Zhang and Yungang Bao", title = "Supporting Differentiated Services in Computers via Programmable Architecture for Resourcing-on-Demand {(PARD)}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "131--143", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694382", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents PARD, a programmable architecture for resourcing-on-demand that provides a new programming interface to convey an application's high-level information like quality-of-service requirements to the hardware. PARD enables new functionalities like fully hardware-supported virtualization and differentiated services in computers. PARD is inspired by the observation that a computer is inherently a network in which hardware components communicate via packets (e.g., over the NoC or PCIe). We apply principles of software-defined networking to this intra-computer network and address three major challenges. First, to deal with the semantic gap between high-level applications and underlying hardware packets, PARD attaches a high-level semantic tag (e.g., a virtual machine or thread ID) to each memory-access, I/O, or interrupt packet. 
Second, to make hardware components more manageable, PARD implements programmable control planes that can be integrated into various shared resources (e.g., cache, DRAM, and I/O devices) and can differentially process packets according to tag-based rules. Third, to facilitate programming, PARD abstracts all control planes as a device file tree to provide a uniform programming interface via which users create and apply tag-based rules. Full-system simulation results show that by co-locating latency-critical memcached applications with other workloads PARD can improve a four-core computer's CPU utilization by up to a factor of four without significantly increasing tail latency. FPGA emulation based on a preliminary RTL implementation demonstrates that the cache control plane introduces no extra latency and that the memory control plane can reduce queueing delay for high-priority memory-access requests by up to a factor of 5.6.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Omote:2015:IAE, author = "Yushi Omote and Takahiro Shinagawa and Kazuhiko Kato", title = "Improving Agility and Elasticity in Bare-metal Clouds", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "145--159", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694349", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Bare-metal clouds are an emerging infrastructure-as-a-service (IaaS) that leases physical machines (bare-metal instances) rather than virtual machines, allowing resource-intensive applications to have exclusive access to physical hardware.
Unfortunately, bare-metal instances require time-consuming or OS-specific tasks for deployment due to the lack of virtualization layers, thereby sacrificing several beneficial features of traditional IaaS clouds such as agility, elasticity, and OS transparency. We present BMcast, an OS deployment system with a special-purpose de-virtualizable virtual machine monitor (VMM) that supports quick and OS-transparent startup of bare-metal instances. BMcast performs streaming OS deployment while allowing direct access to physical hardware from the guest OS, and then disappears after completing the deployment. Quick startup of instances improves agility and elasticity significantly, and OS transparency greatly simplifies management tasks for cloud customers. Experimental results have confirmed that BMcast initiated a bare-metal instance 8.6 times faster than image copying, and database performance on BMcast during streaming OS deployment was comparable to that on a state-of-the-art VMM without performing deployment. BMcast incurred zero overhead after de-virtualization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Haque:2015:FMI, author = "Md E. Haque and Yong hun Eom and Yuxiong He and Sameh Elnikety and Ricardo Bianchini and Kathryn S. McKinley", title = "Few-to-Many: Incremental Parallelism for Reducing Tail Latency in Interactive Services", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "161--175", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694384", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Interactive services, such as Web search, recommendations, games, and finance, must respond quickly to satisfy customers. 
Achieving this goal requires optimizing tail (e.g., 99th+ percentile) latency. Although every server is multicore, parallelizing individual requests to reduce tail latency is challenging because (1) service demand is unknown when requests arrive; (2) blindly parallelizing all requests quickly oversubscribes hardware resources; and (3) parallelizing the numerous short requests will not improve tail latency. This paper introduces Few-to-Many (FM) incremental parallelization, which dynamically increases parallelism to reduce tail latency. FM uses request service demand profiles and hardware parallelism in an offline phase to compute a policy, represented as an interval table, which specifies when and how much software parallelism to add. At runtime, FM adds parallelism as specified by the interval table indexed by dynamic system load and request execution time progress. The longer a request executes, the more parallelism FM adds. We evaluate FM in Lucene, an open-source enterprise search engine, and in Bing, a commercial Web search engine. FM improves the 99th percentile response time up to 32\% in Lucene and up to 26\% in Bing, compared to prior state-of-the-art parallelization. Compared to running requests sequentially in Bing, FM improves tail latency by a factor of two. 
These results illustrate that incremental parallelism is a powerful tool for reducing tail latency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Colp:2015:PDS, author = "Patrick Colp and Jiawen Zhang and James Gleeson and Sahil Suneja and Eyal de Lara and Himanshu Raj and Stefan Saroiu and Alec Wolman", title = "Protecting Data on {Smartphones} and Tablets from Memory Attacks", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "177--189", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694380", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Smartphones and tablets are easily lost or stolen. This makes them susceptible to an inexpensive class of memory attacks, such as cold-boot attacks, using a bus monitor to observe the memory bus, and DMA attacks. This paper describes Sentry, a system that allows applications and OS components to store their code and data on the System-on-Chip (SoC) rather than in DRAM. 
We use ARM-specific mechanisms originally designed for embedded systems, but still present in today's mobile devices, to protect applications and OS subsystems from memory attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Dautenhahn:2015:NKO, author = "Nathan Dautenhahn and Theodoros Kasampalis and Will Dietz and John Criswell and Vikram Adve", title = "Nested Kernel: an Operating System Architecture for Intra-Kernel Privilege Separation", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "191--206", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694386", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Monolithic operating system designs undermine the security of computing systems by allowing single exploits anywhere in the kernel to enjoy full supervisor privilege. The nested kernel operating system architecture addresses this problem by ``nesting'' a small isolated kernel within a traditional monolithic kernel. The ``nested kernel'' interposes on all updates to virtual memory translations to assert protections on physical memory, thus significantly reducing the trusted computing base for memory access control enforcement. We incorporated the nested kernel architecture into FreeBSD on x86-64 hardware while allowing the entire operating system, including untrusted components, to operate at the highest hardware privilege level by write-protecting MMU translations and de-privileging the untrusted part of the kernel. Our implementation inherently enforces kernel code integrity while still allowing dynamically loaded kernel modules, thus defending against code injection attacks. 
We also demonstrate that the nested kernel architecture allows kernel developers to isolate memory in ways not possible in monolithic kernels by introducing write-mediation and write-logging services to protect critical system data structures. Performance of the nested kernel prototype shows modest overheads: $< 1\%$ average for Apache and 2.7\% for kernel compile. Overall, our results and experience show that the nested kernel design can be retrofitted to existing monolithic kernels, providing important security benefits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Tan:2015:DWS, author = "Zhangxi Tan and Zhenghao Qian and Xi Chen and Krste Asanovic and David Patterson", title = "{DIABLO}: a Warehouse-Scale Computer Network Simulator using {FPGAs}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "207--221", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694362", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Motivated by rapid software and hardware innovation in warehouse-scale computing (WSC), we visit the problem of warehouse-scale network design evaluation. A WSC is composed of about 30 arrays or clusters, each of which contains about 3000 servers, leading to a total of about 100,000 servers per WSC. We found many prior experiments have been conducted on relatively small physical testbeds, and they often assume the workload is static and that computations are only loosely coupled with the adaptive networking stack. We present a novel and cost-efficient FPGA-based evaluation methodology, called Datacenter-In-A-Box at LOw cost (DIABLO), which treats arrays as whole computers with tightly integrated hardware and software.
We have built a 3,000-node prototype running the full WSC software stack. Using our prototype, we have successfully reproduced a few WSC phenomena, such as TCP Incast and memcached request latency long tail, and found that results do indeed change with both scale and with version of the full software stack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Hauswald:2015:SOE, author = "Johann Hauswald and Michael A. Laurenzano and Yunqi Zhang and Cheng Li and Austin Rovinski and Arjun Khurana and Ronald G. Dreslinski and Trevor Mudge and Vinicius Petrucci and Lingjia Tang and Jason Mars", title = "{Sirius}: an Open End-to-End Voice and Vision Personal Assistant and Its Implications for Future Warehouse Scale Computers", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "223--238", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694347", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As user demand scales for intelligent personal assistants (IPAs) such as Apple's Siri, Google's Google Now, and Microsoft's Cortana, we are approaching the computational limits of current datacenter architectures. It is an open question how future server architectures should evolve to enable this emerging class of applications, and the lack of an open-source IPA workload is an obstacle in addressing this question. In this paper, we present the design of Sirius, an open end-to-end IPA web-service application that accepts queries in the form of voice and images, and responds with natural language. 
We then use this workload to investigate the implications of four points in the design space of future accelerator-based server architectures spanning traditional CPUs, GPUs, manycore throughput co-processors, and FPGAs. To investigate future server designs for Sirius, we decompose Sirius into a suite of 7 benchmarks (Sirius Suite) comprising the computationally intensive bottlenecks of Sirius. We port Sirius Suite to a spectrum of accelerator platforms and use the performance and power trade-offs across these platforms to perform a total cost of ownership (TCO) analysis of various server design points. In our study, we find that accelerators are critical for the future scalability of IPA services. Our results show that GPU- and FPGA-accelerated servers improve the query latency on average by 10x and 16x. For a given throughput, GPU- and FPGA-accelerated servers can reduce the TCO of datacenters by 2.6x and 1.4x, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Xu:2015:ALD, author = "Chao Xu and Felix Xiaozhu Lin and Yuyang Wang and Lin Zhong", title = "Automated {OS}-level Device Runtime Power Management", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "239--252", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694360", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Non-CPU devices on a modern system-on-a-chip (SoC), ranging from accelerators to I/O controllers, account for a significant portion of the chip area. It is therefore vital for system energy efficiency that idle devices can enter a low-power state while still meeting the performance expectation. 
This is called device runtime Power Management (PM) for which individual device drivers in commodity OSes are held responsible today. Based on the observations of existing drivers and their evolution, we consider it harmful to rely on drivers for device runtime PM. This paper identifies three pieces of information as essential to device runtime PM, and shows that they can be obtained without involving drivers, either by using a software-only approach, or more efficiently, by adding one register bit to each device. We thus suggest a structural change to the current Linux runtime PM framework, replacing the PM code in all applicable drivers with a single kernel module called the central PM agent. Experimental evaluations show that the central PM agent is just as effective as hand-tuned driver PM code. The paper also presents a tool called PowerAdvisor that simplifies driver PM efforts under the current Linux runtime PM framework. PowerAdvisor analyzes execution traces and suggests where to insert PM calls in driver source code. Despite being a best-effort tool, PowerAdvisor not only reproduces hand-tuned PM code from stock drivers, but also correctly suggests PM code never known before. Overall, our experience shows that it is promising to ultimately free driver developers from manual PM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Goiri:2015:CTV, author = "{\'I}{\~n}igo Goiri and Thu D. 
Nguyen and Ricardo Bianchini", title = "{CoolAir}: Temperature- and Variation-Aware Management for Free-Cooled Datacenters", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "253--265", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694378", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Despite its benefits, free cooling may expose servers to high absolute temperatures, wide temperature variations, and high humidity when datacenters are sited at certain locations. Prior research (in non-free-cooled datacenters) has shown that high temperatures and/or wide temporal temperature variations can harm hardware reliability. In this paper, we identify the runtime management strategies required to limit absolute temperatures, temperature variations, humidity, and cooling energy in free-cooled datacenters. As the basis for our study, we propose CoolAir, a system that embodies these strategies. Using CoolAir and a real free-cooled datacenter prototype, we show that effective management requires cooling infrastructures that can act smoothly. In addition, we show that CoolAir can tightly manage temperature and significantly reduce temperature variation, often at a lower cooling cost than existing free-cooled datacenters. Perhaps most importantly, based on our results, we derive several principles and lessons that should guide the design of management systems for free-cooled datacenters of any size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Mishra:2015:PGM, author = "Nikita Mishra and Huazhe Zhang and John D. 
Lafferty and Henry Hoffmann", title = "A Probabilistic Graphical Model-based Approach for Minimizing Energy Under Performance Constraints", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "267--281", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694373", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In many deployments, computer systems are underutilized --- meaning that applications have performance requirements that demand less than full system capacity. Ideally, we would take advantage of this under-utilization by allocating system resources so that the performance requirements are met and energy is minimized. This optimization problem is complicated by the fact that the performance and power consumption of various system configurations are often application --- or even input --- dependent. Thus, practically, minimizing energy for a performance constraint requires fast, accurate estimations of application-dependent performance and power tradeoffs. This paper investigates machine learning techniques that enable energy savings by learning Pareto-optimal power and performance tradeoffs. Specifically, we propose LEO, a probabilistic graphical model-based learning system that provides accurate online estimates of an application's power and performance as a function of system configuration. We compare LEO to (1) offline learning, (2) online learning, (3) a heuristic approach, and (4) the true optimal solution. We find that LEO produces the most accurate estimates and near optimal energy savings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Pang:2015:MLL, author = "Jun Pang and Chris Dwyer and Alvin R. 
Lebeck", title = "More is Less, Less is More: Molecular-Scale Photonic {NoC} Power Topologies", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "283--296", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694377", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Molecular-scale Network-on-Chip (mNoC) crossbars use quantum dot LEDs as an on-chip light source, and chromophores to provide optical signal filtering for receivers. An mNoC reduces power consumption or enables scaling to larger crossbars for a reduced energy budget compared to current nanophotonic NoC crossbars. Since communication latency is reduced by using a high-radix crossbar, minimizing power consumption becomes a primary design target. Conventional Single Writer Multiple Reader (SWMR) photonic crossbar designs broadcast all packets, and incur the commensurate required power, even if only two nodes are communicating. This paper introduces power topologies, enabled by unique capabilities of mNoC technology, to reduce overall interconnect power consumption. A power topology corresponds to the logical connectivity provided by a given power mode. Broadcast is one power mode and it consumes the maximum power. Additional power modes consume less power but allow a source to communicate with only a statically defined, potentially non-contiguous, subset of nodes. Overall interconnect power is reduced if the more frequently communicating nodes use modes that consume less power, while less frequently communicating nodes use modes that consume more power. We also investigate thread mapping techniques to fully exploit power topologies. We explore various mNoC power topologies with one, two and four power modes for a radix-256 SWMR mNoC crossbar. 
Our results show that the combination of power topologies and intelligent thread mapping can reduce total mNoC power by up to 51\% on average for a set of 12 SPLASH benchmarks. Furthermore performance is 10\% better than conventional resonator-based photonic NoCs and energy is reduced by 72\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Sridharan:2015:MEM, author = "Vilas Sridharan and Nathan DeBardeleben and Sean Blanchard and Kurt B. Ferreira and Jon Stearley and John Shalf and Sudhanva Gurumurthi", title = "Memory Errors in Modern Systems: The Good, The Bad, and The Ugly", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "297--310", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694348", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Several recent publications have shown that hardware faults in the memory subsystem are commonplace. These faults are predicted to become more frequent in future systems that contain orders of magnitude more DRAM and SRAM than found in current memory subsystems. These memory subsystems will need to provide resilience techniques to tolerate these faults when deployed in high-performance computing systems and data centers containing tens of thousands of nodes. Therefore, it is critical to understand the efficacy of current hardware resilience techniques to determine whether they will be suitable for future systems. In this paper, we present a study of DRAM and SRAM faults and errors from the field. We use data from two leadership-class high-performance computer systems to analyze the reliability impact of hardware resilience schemes that are deployed in current systems. 
Our study has several key findings about the efficacy of many currently deployed reliability techniques such as DRAM ECC, DDR address/command parity, and SRAM ECC and parity. We also perform a methodological study, and find that counting errors instead of faults, a common practice among researchers and data center operators, can lead to incorrect conclusions about system reliability. Finally, we use our data to project the needs of future large-scale systems. We find that SRAM faults are unlikely to pose a significantly larger reliability threat in the future, while DRAM faults will be a major concern and stronger DRAM resilience schemes will be needed to maintain acceptable failure rates similar to those found on today's systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Yetim:2015:CMC, author = "Yavuz Yetim and Sharad Malik and Margaret Martonosi", title = "{CommGuard}: Mitigating Communication Errors in Error-Prone Parallel Execution", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "311--323", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694354", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As semiconductor technology scales towards ever-smaller transistor sizes, hardware fault rates are increasing. Since important application classes (e.g., multimedia, streaming workloads) are data-error-tolerant, recent research has proposed techniques that seek to save energy or improve yield by exploiting error tolerance at the architecture/microarchitecture level. Even seemingly error-tolerant applications, however, will crash or hang due to control-flow/memory addressing errors. 
In parallel computation, errors involving inter-thread communication can have equally catastrophic effects. Our work explores techniques that mitigate the impact of potentially catastrophic errors in parallel computation, while still garnering power, cost, or yield benefits from data error tolerance. Our proposed CommGuard solution uses FSM-based checkers to pad and discard data in order to maintain semantic alignment between program control flow and the data communicated between processors. CommGuard techniques are low overhead and they exploit application information already provided by some parallel programming languages (e.g. StreamIt). By converting potentially catastrophic communication errors into potentially tolerable data errors, CommGuard allows important streaming applications like JPEG and MP3 decoding to execute without crashing and to sustain good output quality, even for errors as frequent as every 500 $ \mu $ s.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Kim:2015:DEF, author = "Dohyeong Kim and Yonghwi Kwon and William N. Sumner and Xiangyu Zhang and Dongyan Xu", title = "Dual Execution for On the Fly Fine Grained Execution Comparison", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "325--338", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694394", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Execution comparison has many applications in debugging, malware analysis, software feature identification, and intrusion detection. Existing comparison techniques have various limitations. Some can only compare at the system event level and require executions to take the same input. 
Some require storing instruction traces that are very space-consuming and have difficulty dealing with non-determinism. In this paper, we propose a novel dual execution technique that allows on-the-fly comparison at the instruction level. Only differences between the executions are recorded. It allows executions to proceed in a coupled mode such that they share the same input sequence with the same timing, reducing nondeterminism. It also allows them to proceed in a decoupled mode such that the user can interact with each one differently. Decoupled executions can be recoupled to share the same future inputs and facilitate further comparison. We have implemented a prototype and applied it to identifying functional components for reuse, comparative debugging with new GDB primitives, and understanding real world regression failures. Our results show that dual execution is a critical enabling technique for execution comparison.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Hosek:2015:VUE, author = "Petr Hosek and Cristian Cadar", title = "{VARAN} the Unbelievable: an Efficient {$N$}-version Execution Framework", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "339--353", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694390", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the widespread availability of multi-core processors, running multiple diversified variants or several different versions of an application in parallel is becoming a viable approach for increasing the reliability and security of software systems. 
The key component of such N-version execution (NVX) systems is a runtime monitor that enables the execution of multiple versions in parallel. Unfortunately, existing monitors impose either a large performance overhead or rely on intrusive kernel-level changes. Moreover, none of the existing solutions scales well with the number of versions, since the runtime monitor acts as a performance bottleneck. In this paper, we introduce Varan, an NVX framework that combines selective binary rewriting with a novel event-streaming architecture to significantly reduce performance overhead and scale well with the number of versions, without relying on intrusive kernel modifications. Our evaluation shows that Varan can run NVX systems based on popular C10k network servers with only a modest performance overhead, and can be effectively used to increase software reliability using techniques such as transparent failover, live sanitization and multi-revision execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Malka:2015:REI, author = "Moshe Malka and Nadav Amit and Muli Ben-Yehuda and Dan Tsafrir", title = "{rIOMMU}: Efficient {IOMMU} for {I/O} Devices that Employ Ring Buffers", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "355--368", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694355", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The IOMMU allows the OS to encapsulate I/O devices in their own virtual memory spaces, thus restricting their DMAs to specific memory pages. The OS uses the IOMMU to protect itself against buggy drivers and malicious/errant devices. 
But the added protection comes at a cost, degrading the throughput of I/O-intensive workloads by up to an order of magnitude. This cost has motivated system designers to trade off some safety for performance, e.g., by leaving stale information in the IOTLB for a while so as to amortize costly invalidations. We observe that high-bandwidth devices---like network and PCIe SSD controllers---interact with the OS via circular ring buffers that induce a sequential, predictable workload. We design a ring IOMMU (rIOMMU) that leverages this characteristic by replacing the virtual memory page table hierarchy with a circular, flat table. A flat table is adequately supported by exactly one IOTLB entry, making every new translation an implicit invalidation of the former and thus requiring explicit invalidations only at the end of I/O bursts. Using standard networking benchmarks, we show that rIOMMU provides up to 7.56x higher throughput relative to the baseline IOMMU, and that it is within 0.77--1.00x the throughput of a system without IOMMU protection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Liu:2015:PPM, author = "Daofu Liu and Tianshi Chen and Shaoli Liu and Jinhong Zhou and Shengyuan Zhou and Olivier Temam and Xiaobing Feng and Xuehai Zhou and Yunji Chen", title = "{PuDianNao}: a Polyvalent Machine Learning Accelerator", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "369--381", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694358", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Machine Learning (ML) techniques are pervasive tools in various emerging commercial applications, but have to be accommodated by powerful computer systems
to process very large data. Although general-purpose CPUs and GPUs have provided straightforward solutions, their energy-efficiencies are limited due to their excessive supports for flexibility. Hardware accelerators may achieve better energy-efficiencies, but each accelerator often accommodates only a single ML technique (family). According to the famous No-Free-Lunch theorem in the ML domain, however, an ML technique performs well on a dataset may perform poorly on another dataset, which implies that such accelerator may sometimes lead to poor learning accuracy. Even if regardless of the learning accuracy, such accelerator can still become inapplicable simply because the concrete ML task is altered, or the user chooses another ML technique. In this study, we present an ML accelerator called PuDianNao, which accommodates seven representative ML techniques, including $k$-means, $k$-nearest neighbors, naive Bayes, support vector machine, linear regression, classification tree, and deep neural network. Benefited from our thorough analysis on computational primitives and locality properties of different ML techniques, PuDianNao can perform up to 1056 GOP/s (e.g., additions and multiplications) in an area of 3.51 mm$^2$, and consumes 596 mW only. Compared with the NVIDIA K20M GPU (28nm process), PuDianNao (65nm process) is 1.20x faster, and can reduce the energy by 128.41x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Goiri:2015:ABA, author = "Inigo Goiri and Ricardo Bianchini and Santosh Nagarakatte and Thu D.
Nguyen", title = "{ApproxHadoop}: Bringing Approximations to {MapReduce} Frameworks", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "383--397", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694351", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose and evaluate a framework for creating and running approximation-enabled MapReduce programs. Specifically, we propose approximation mechanisms that fit naturally into the MapReduce paradigm, including input data sampling, task dropping, and accepting and running a precise and a user-defined approximate version of the MapReduce code. We then show how to leverage statistical theories to compute error bounds for popular classes of MapReduce programs when approximating with input data sampling and/or task dropping. We implement the proposed mechanisms and error bound estimations in a prototype system called ApproxHadoop. Our evaluation uses MapReduce applications from different domains, including data analytics, scientific computing, video encoding, and machine learning. Our results show that ApproxHadoop can significantly reduce application execution time and/or energy consumption when the user is willing to tolerate small errors. For example, ApproxHadoop can reduce runtimes by up to 32x when the user can tolerate an error of 1\% with 95\% confidence. 
We conclude that our framework and system can make approximation easily accessible to many application domains using the MapReduce model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Ringenburg:2015:MDQ, author = "Michael Ringenburg and Adrian Sampson and Isaac Ackerman and Luis Ceze and Dan Grossman", title = "Monitoring and Debugging the Quality of Results in Approximate Programs", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "399--411", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694365", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Energy efficiency is a key concern in the design of modern computer systems. One promising approach to energy-efficient computation, approximate computing, trades off output accuracy for significant gains in energy efficiency. However, debugging the actual cause of output quality problems in approximate programs is challenging. This paper presents dynamic techniques to debug and monitor the quality of approximate computations. We propose both offline debugging tools that instrument code to determine the key sources of output degradation and online approaches that monitor the quality of deployed applications. We present two offline debugging techniques and three online monitoring mechanisms. The first offline tool identifies correlations between output quality and the execution of individual approximate operations. The second tracks approximate operations that flow into a particular value. Our online monitoring mechanisms are complementary approaches designed for detecting quality problems in deployed applications, while still maintaining the energy savings from approximation. 
We present implementations of our techniques and describe their usage with seven applications. Our online monitors control output quality while still maintaining significant energy efficiency gains, and our offline tools provide new insights into the effects of approximation on output quality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Banavar:2015:WEC, author = "Guruduth Banavar", title = "{Watson} and the Era of Cognitive Computing", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "413--413", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694376", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the last decade, the availability of massive amounts of new data, and the development of new machine learning technologies, have augmented reasoning systems to give rise to a new class of computing systems. These ``Cognitive Systems'' learn from data, reason from models, and interact naturally with us, to perform complex tasks better than either humans or machines can do by themselves. In essence, cognitive systems help us perform like the best by penetrating the complexity of big data and leverage the power of models. One of the first cognitive systems, called Watson, demonstrated through a Jeopardy! exhibition match, that it was capable of answering complex factoid questions as effectively as the world's champions. Follow-on cognitive systems perform other tasks, such as discovery, reasoning, and multi-modal understanding in a variety of domains, such as healthcare, insurance, and education. We believe such cognitive systems will transform every industry and our everyday life for the better. 
In this talk, I will give an overview of the applications, the underlying capabilities, and some of the key challenges, of cognitive systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Stewart:2015:ZDW, author = "Gordon Stewart and Mahanth Gowda and Geoffrey Mainland and Bozidar Radunovic and Dimitrios Vytiniotis and Cristina Luengo Agullo", title = "{Ziria}: a {DSL} for Wireless Systems Programming", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "415--428", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694368", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software-defined radio (SDR) brings the flexibility of software to wireless protocol design, promising an ideal platform for innovation and rapid protocol deployment. However, implementing modern wireless protocols on existing SDR platforms often requires careful hand-tuning of low-level code, which can undermine the advantages of software. Ziria is a new domain-specific language (DSL) that offers programming abstractions suitable for wireless physical (PHY) layer tasks while emphasizing the pipeline reconfiguration aspects of PHY programming. The Ziria compiler implements a rich set of specialized optimizations, such as lookup table generation and pipeline fusion. We also offer a novel --- due to pipeline reconfiguration --- algorithm to optimize the data widths of computations in Ziria pipelines. 
We demonstrate the programming flexibility of Ziria and the performance of the generated code through a detailed evaluation of a line-rate Ziria WiFi 802.11a/g implementation that is on par and in many cases outperforms a hand-tuned state-of-the-art C++ implementation on commodity CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Mullapudi:2015:PAO, author = "Ravi Teja Mullapudi and Vinay Vasista and Uday Bondhugula", title = "{PolyMage}: Automatic Optimization for Image Processing Pipelines", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "429--443", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694364", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents the design and implementation of PolyMage, a domain-specific language and compiler for image processing pipelines. An image processing pipeline can be viewed as a graph of interconnected stages which process images successively. Each stage typically performs one of point-wise, stencil, reduction or data-dependent operations on image pixels. Individual stages in a pipeline typically exhibit abundant data parallelism that can be exploited with relative ease. However, the stages also require high memory bandwidth preventing effective utilization of parallelism available on modern architectures. For applications that demand high performance, the traditional options are to use optimized libraries like OpenCV or to optimize manually. While using libraries precludes optimization across library routines, manual optimization accounting for both parallelism and locality is very tedious. 
The focus of our system, PolyMage, is on automatically generating high-performance implementations of image processing pipelines expressed in a high-level declarative language. Our optimization approach primarily relies on the transformation and code generation capabilities of the polyhedral compiler framework. To the best of our knowledge, this is the first model-driven compiler for image processing pipelines that performs complex fusion, tiling, and storage optimization automatically. Experimental results on a modern multicore system show that the performance achieved by our automatic approach is up to 1.81x better than that achieved through manual tuning in Halide, a state-of-the-art language and compiler for image processing pipelines. For a camera raw image processing pipeline, our performance is comparable to that of a hand-tuned implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Heckey:2015:CMC, author = "Jeff Heckey and Shruti Patil and Ali JavadiAbhari and Adam Holmes and Daniel Kudrow and Kenneth R. Brown and Diana Franklin and Frederic T. Chong and Margaret Martonosi", title = "Compiler Management of Communication and Parallelism for Quantum Computation", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "445--456", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694357", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Quantum computing (QC) offers huge promise to accelerate a range of computationally intensive benchmarks. Quantum computing is limited, however, by the challenges of decoherence: i.e., a quantum state can only be maintained for short windows of time before it decoheres. 
While quantum error correction codes can protect against decoherence, fast execution time is the best defense against decoherence, so efficient architectures and effective scheduling algorithms are necessary. This paper proposes the Multi-SIMD QC architecture and then proposes and evaluates effective schedulers to map benchmark descriptions onto Multi-SIMD architectures. The Multi-SIMD model consists of a small number of SIMD regions, each of which may support operations on up to thousands of qubits per cycle. Efficient Multi-SIMD operation requires efficient scheduling. This work develops schedulers to reduce communication requirements of qubits between operating regions, while also improving parallelism. We find that communication to global memory is a dominant cost in QC. We also note that many quantum benchmarks have long serial operation paths (although each operation may be data parallel). To exploit this characteristic, we introduce Longest-Path-First Scheduling (LPFS) which pins operations to SIMD regions to keep data in-place and reduce communication to memory. The use of small, local scratchpad memories also further reduces communication. Our results show a 3\% to 308\% improvement for LPFS over conventional scheduling algorithms, and an additional 3\% to 64\% improvement using scratchpad memories. Our work is the most comprehensive software-to-quantum toolflow published to date, with efficient and practical scheduling techniques that reduce communication and increase parallelism for full-scale quantum code executing up to a trillion quantum gate operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Hassaan:2015:KDG, author = "Muhammad Amber Hassaan and Donald D. Nguyen and Keshav K.
Pingali", title = "Kinetic Dependence Graphs", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "457--471", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694363", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Task graphs or dependence graphs are used in runtime systems to schedule tasks for parallel execution. In problem domains such as dense linear algebra and signal processing, dependence graphs can be generated from a program by static analysis. However, in emerging problem domains such as graph analytics, the set of tasks and dependences between tasks in a program are complex functions of runtime values and cannot be determined statically. In this paper, we introduce a novel approach for exploiting parallelism in such programs. This approach is based on a data structure called the kinetic dependence graph (KDG), which consists of a dependence graph together with update rules that incrementally update the graph to reflect changes in the dependence structure whenever a task is completed. We have implemented a simple programming model that allows programmers to write these applications at a high level of abstraction, and a runtime within the Galois system [15] that builds the KDG automatically and executes the program in parallel. 
On a suite of programs that are difficult to parallelize otherwise, we have obtained speedups of up to 33 on 40 cores, out-performing third-party implementations in many cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Sidiroglou-Douskos:2015:TAI, author = "Stelios Sidiroglou-Douskos and Eric Lahtinen and Nathan Rittenhouse and Paolo Piselli and Fan Long and Deokhwan Kim and Martin Rinard", title = "Targeted Automatic Integer Overflow Discovery Using Goal-Directed Conditional Branch Enforcement", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "473--486", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694389", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present a new technique and system, DIODE, for automatically generating inputs that trigger overflows at memory allocation sites. DIODE is designed to identify relevant sanity checks that inputs must satisfy to trigger overflows at target memory allocation sites, then generate inputs that satisfy these sanity checks to successfully trigger the overflow. DIODE works with off-the-shelf, production x86 binaries.
Our results show that, for our benchmark set of applications, and for every target memory allocation site exercised by our seed inputs (which the applications process correctly with no overflows), either (1) DIODE is able to generate an input that triggers an overflow at that site or (2) there is no input that would trigger an overflow for the observed target expression at that site.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Dhawan:2015:ASS, author = "Udit Dhawan and Catalin Hritcu and Raphael Rubin and Nikos Vasilakis and Silviu Chiricescu and Jonathan M. Smith and Thomas F. {Knight, Jr.} and Benjamin C. Pierce and Andre DeHon", title = "Architectural Support for Software-Defined Metadata Processing", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "487--502", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694383", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Optimized hardware for propagating and checking software-programmable metadata tags can achieve low runtime overhead. We generalize prior work on hardware tagging by considering a generic architecture that supports software-defined policies over metadata of arbitrary size and complexity; we introduce several novel microarchitectural optimizations that keep the overhead of this rich processing low. Our model thus achieves the efficiency of previous hardware-based approaches with the flexibility of the software-based ones. 
We demonstrate this by using it to enforce four diverse safety and security policies---spatial and temporal memory safety, taint tracking, control-flow integrity, and code and data separation---plus a composite policy that enforces all of them simultaneously. Experiments on SPEC CPU2006 benchmarks with a PUMP-enhanced RISC processor show modest impact on runtime (typically under 10\%) and power ceiling (less than 10\%), in return for some increase in energy usage (typically under 60\%) and area for on-chip memory structures (110\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Zhang:2015:HDL, author = "Danfeng Zhang and Yao Wang and G. Edward Suh and Andrew C. Myers", title = "A Hardware Design Language for Timing-Sensitive Information-Flow Security", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "503--516", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694372", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Information security can be compromised by leakage via low-level hardware features. One recently prominent example is cache probing attacks, which rely on timing channels created by caches. We introduce a hardware design language, SecVerilog, which makes it possible to statically analyze information flow at the hardware level. With SecVerilog, systems can be built with verifiable control of timing channels and other information channels. SecVerilog is Verilog, extended with expressive type annotations that enable precise reasoning about information flow. It also comes with rigorous formal assurance: we prove that SecVerilog enforces timing-sensitive noninterference and thus ensures secure information flow. 
By building a secure MIPS processor and its caches, we demonstrate that SecVerilog makes it possible to build complex hardware designs with verified security, yet with low overhead in time, space, and HW designer effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Hicks:2015:SLR, author = "Matthew Hicks and Cynthia Sturton and Samuel T. King and Jonathan M. Smith", title = "{SPECS}: a Lightweight Runtime Mechanism for Protecting Software from Security-Critical Processor Bugs", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "517--529", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694366", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Processor implementation errata remain a problem, and worse, a subset of these bugs are security-critical. We classified 7 years of errata from recent commercial processors to understand the magnitude and severity of this problem, and found that of 301 errata analyzed, 28 are security-critical. We propose the SECURITY-CRITICAL PROCESSOR ERRATA CATCHING SYSTEM (SPECS) as a low-overhead solution to this problem. SPECS employs a dynamic verification strategy that is made lightweight by limiting protection to only security-critical processor state. As a proof-of-concept, we implement a hardware prototype of SPECS in an open source processor. Using this prototype, we evaluate SPECS against a set of 14 bugs inspired by the types of security-critical errata we discovered in the classification phase.
The evaluation shows that SPECS is 86\% effective as a defense when deployed using only ISA-level state; incurs less than 5\% area and power overhead; and has no software run-time overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Duan:2015:AMF, author = "Yuelu Duan and Nima Honarmand and Josep Torrellas", title = "Asymmetric Memory Fences: Optimizing Both Performance and Implementability", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "531--543", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694388", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "There have been several recent efforts to improve the performance of fences. The most aggressive designs allow post-fence accesses to retire and complete before the fence completes. Unfortunately, such designs present implementation difficulties due to their reliance on global state and structures. This paper's goal is to optimize both the performance and the implementability of fences. We start-off with a design like the most aggressive ones but without the global state. We call it Weak Fence or wF. Since the concurrent execution of multiple wFs can deadlock, we combine wFs with a conventional fence (i.e., Strong Fence or sF) for the less performance-critical thread(s). We call the result an Asymmetric fence group. We also propose a taxonomy of Asymmetric fence groups under TSO. Compared to past aggressive fences, Asymmetric fence groups both are substantially easier to implement and have higher average performance. 
The two main designs presented (WS+ and W+) speed-up workloads under TSO by an average of 13\% and 21\%, respectively, over conventional fences.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Sung:2015:DES, author = "Hyojin Sung and Sarita V. Adve", title = "{DeNovoSync}: Efficient Support for Arbitrary Synchronization without Writer-Initiated Invalidations", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "545--559", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694356", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Current shared-memory hardware is complex and inefficient. Prior work on the DeNovo coherence protocol showed that disciplined shared-memory programming models can enable more complexity-, performance-, and energy-efficient hardware than the state-of-the-art MESI protocol. DeNovo, however, severely restricted the synchronization constructs an application can support. This paper proposes DeNovoSync, a technique to support arbitrary synchronization in DeNovo. The key challenge is that DeNovo exploits race-freedom to use reader-initiated local self-invalidations (instead of conventional writer-initiated remote cache invalidations) to ensure coherence. Synchronization accesses are inherently racy and not directly amenable to self-invalidations. DeNovoSync addresses this challenge using a novel combination of registration of all synchronization reads with a judicious hardware backoff to limit unnecessary registrations. 
For a wide variety of synchronization constructs and applications, compared to MESI, DeNovoSync shows comparable or up to 22\% lower execution time and up to 58\% lower network traffic, enabling DeNovo's advantages for a much broader class of software than previously possible.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Sengupta:2015:HSD, author = "Aritra Sengupta and Swarnendu Biswas and Minjia Zhang and Michael D. Bond and Milind Kulkarni", title = "Hybrid Static-Dynamic Analysis for Statically Bounded Region Serializability", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "561--575", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694379", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Data races are common. They are difficult to detect, avoid, or eliminate, and programmers sometimes introduce them intentionally. However, shared-memory programs with data races have unexpected, erroneous behaviors. Intentional and unintentional data races lead to atomicity and sequential consistency (SC) violations, and they make it more difficult to understand, test, and verify software. Existing approaches for providing stronger guarantees for racy executions add high run-time overhead and/or rely on custom hardware. This paper shows how to provide stronger semantics for racy programs while providing relatively good performance on commodity systems. A novel hybrid static--dynamic analysis called \emph{EnfoRSer} provides end-to-end support for a memory model called \emph{statically bounded region serializability} (SBRS) that is not only stronger than weak memory models but is strictly stronger than SC. 
EnfoRSer uses static compiler analysis to transform regions, and dynamic analysis to detect and resolve conflicts at run time. By demonstrating commodity support for a reasonably strong memory model with reasonable overheads, we show its potential as an always-on execution model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Alglave:2015:GCW, author = "Jade Alglave and Mark Batty and Alastair F. Donaldson and Ganesh Gopalakrishnan and Jeroen Ketema and Daniel Poetzl and Tyler Sorensen and John Wickerson", title = "{GPU} Concurrency: Weak Behaviours and Programming Assumptions", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "577--591", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694391", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Concurrency is pervasive and perplexing, particularly on graphics processing units (GPUs). Current specifications of languages and hardware are inconclusive; thus programmers often rely on folklore assumptions when writing software. To remedy this state of affairs, we conducted a large empirical study of the concurrent behaviour of deployed GPUs. Armed with litmus tests (i.e. short concurrent programs), we questioned the assumptions in programming guides and vendor documentation about the guarantees provided by hardware. We developed a tool to generate thousands of litmus tests and run them under stressful workloads. We observed a litany of previously elusive weak behaviours, and exposed folklore beliefs about GPU programming---often supported by official tutorials---as false. 
As a way forward, we propose a model of Nvidia GPU hardware, which correctly models every behaviour witnessed in our experiments. The model is a variant of SPARC Relaxed Memory Order (RMO), structured following the GPU concurrency hierarchy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Park:2015:CCP, author = "Jason Jong Kyu Park and Yongjun Park and Scott Mahlke", title = "{Chimera}: Collaborative Preemption for Multitasking on a Shared {GPU}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "593--606", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694346", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The demand for multitasking on graphics processing units (GPUs) is constantly increasing as they have become one of the default components on modern computer systems along with traditional processors (CPUs). Preemptive multitasking on CPUs has been primarily supported through context switching. However, the same preemption strategy incurs substantial overhead due to the large context in GPUs. The overhead comes in two dimensions: a preempting kernel suffers from a long preemption latency, and the system throughput is wasted during the switch. Without precise control over the large preemption overhead, multitasking on GPUs has little use for applications with strict latency requirements. In this paper, we propose Chimera, a collaborative preemption approach that can precisely control the overhead for multitasking on GPUs. Chimera first introduces streaming multiprocessor (SM) flushing, which can instantly preempt an SM by detecting and exploiting idempotent execution. 
Chimera utilizes flushing collaboratively with two previously proposed preemption techniques for GPUs, namely context switching and draining to minimize throughput overhead while achieving a required preemption latency. Evaluations show that Chimera violates the deadline for only 0.2\% of preemption requests when a 15us preemption latency constraint is used. For multi-programmed workloads, Chimera can improve the average normalized turnaround time by 5.5x, and system throughput by 12.2\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Agarwal:2015:PPS, author = "Neha Agarwal and David Nellans and Mark Stephenson and Mike O'Connor and Stephen W. Keckler", title = "Page Placement Strategies for {GPUs} within Heterogeneous Memory Systems", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "607--618", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694381", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Systems from smartphones to supercomputers are increasingly heterogeneous, being composed of both CPUs and GPUs. To maximize cost and energy efficiency, these systems will increasingly use globally-addressable heterogeneous memory systems, making choices about memory page placement critical to performance. In this work we show that current page placement policies are not sufficient to maximize GPU performance in these heterogeneous memory systems. We propose two new page placement policies that improve GPU performance: one application agnostic and one using application profile information. 
Our application agnostic policy, bandwidth-aware (BW-AWARE) placement, maximizes GPU throughput by balancing page placement across the memories based on the aggregate memory bandwidth available in a system. Our simulation-based results show that BW-AWARE placement outperforms the existing Linux INTERLEAVE and LOCAL policies by 35\% and 18\% on average for GPU compute workloads. We build upon BW-AWARE placement by developing a compiler-based profiling mechanism that provides programmers with information about GPU application data structure access patterns. Combining this information with simple program-annotated hints about memory placement, our hint-based page placement approach performs within 90\% of oracular page placement on average, largely mitigating the need for costly dynamic page tracking and migration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Zhao:2015:FPS, author = "Zhijia Zhao and Xipeng Shen", title = "On-the-Fly Principled Speculation for {FSM} Parallelization", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "619--630", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694369", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Finite State Machine (FSM) is the backbone of an important class of applications in many domains. Its parallelization has been extremely difficult due to inherent strong dependences in the computation. Recently, principled speculation shows good promise to solve the problem. However, the reliance on offline training makes the approach inconvenient to adopt and hard to apply to many practical FSM applications, which often deal with a large variety of inputs different from training inputs. 
This work presents an assembly of techniques that completely remove the needs for offline training. The techniques include a set of theoretical results on inherent properties of FSMs, and two newly designed dynamic optimizations for efficient FSM characterization. The new techniques, for the first time, make principled speculation applicable on the fly, and enable swift, automatic configuration of speculative parallelizations to best suit a given FSM and its current input. They eliminate the fundamental barrier for practical adoption of principled speculation for FSM parallelization. Experiments show that the new techniques give significantly higher speedups for some difficult FSM applications in the presence of input changes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{David:2015:ACS, author = "Tudor David and Rachid Guerraoui and Vasileios Trigonakis", title = "Asynchronized Concurrency: The Secret to Scaling Concurrent Search Data Structures", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "631--644", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694359", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We introduce ``asynchronized concurrency (ASCY),'' a paradigm consisting of four complementary programming patterns. ASCY calls for the design of concurrent search data structures (CSDSs) to resemble that of their sequential counterparts. 
We argue that ASCY leads to implementations which are portably scalable: they scale across different types of hardware platforms, including single and multi-socket ones, for various classes of workloads, such as read-only and read-write, and according to different performance metrics, including throughput, latency, and energy. We substantiate our thesis through the most exhaustive evaluation of CSDSs to date, involving 6 platforms, 22 state-of-the-art CSDS algorithms, 10 re-engineered state-of-the-art CSDS algorithms following the ASCY patterns, and 2 new CSDS algorithms designed with ASCY in mind. We observe up to 30\% improvements in throughput in the re-engineered algorithms, while our new algorithms out-perform the state-of-the-art alternatives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Bhatotia:2015:ITL, author = "Pramod Bhatotia and Pedro Fonseca and Umut A. Acar and Bj{\"o}rn B. Brandenburg and Rodrigo Rodrigues", title = "{iThreads}: a Threading Library for Parallel Incremental Computation", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "645--659", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694371", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Incremental computation strives for efficient successive runs of applications by re-executing only those parts of the computation that are affected by a given input change instead of recomputing everything from scratch. To realize these benefits automatically, we describe iThreads, a threading library for parallel incremental computation. 
iThreads supports unmodified shared-memory multithreaded programs: it can be used as a replacement for pthreads by a simple exchange of dynamically linked libraries, without even recompiling the application code. To enable such an interface, we designed algorithms and an implementation to operate at the compiled binary code level by leveraging MMU-assisted memory access tracking and process-based thread isolation. Our evaluation on a multicore platform using applications from the PARSEC and Phoenix benchmarks and two case-studies shows significant performance gains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Gidra:2015:NGC, author = "Lokesh Gidra and Ga{\"e}l Thomas and Julien Sopena and Marc Shapiro and Nhan Nguyen", title = "{NumaGiC}: a Garbage Collector for Big Data on Big {NUMA} Machines", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "661--673", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694361", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "On contemporary cache-coherent Non-Uniform Memory Access (ccNUMA) architectures, applications with a large memory footprint suffer from the cost of the garbage collector (GC), because, as the GC scans the reference graph, it makes many remote memory accesses, saturating the interconnect between memory nodes. We address this problem with NumaGiC, a GC with a mostly-distributed design. In order to maximise memory access locality during collection, a GC thread avoids accessing a different memory node, instead notifying a remote GC thread with a message; nonetheless, NumaGiC avoids the drawbacks of a pure distributed design, which tends to decrease parallelism. 
We compare NumaGiC with Parallel Scavenge and NAPS on two different ccNUMA architectures running on the Hotspot Java Virtual Machine of OpenJDK 7. On Spark and Neo4j, two industry-strength analytics applications, with heap sizes ranging from 160GB to 350GB, and on SPECjbb2013 and SPECjbb2005, NumaGiC improves overall performance by up to 45\% over NAPS (up to 94\% over Parallel Scavenge), and increases the performance of the collector itself by up to 3.6x over NAPS (up to 5.4x over Parallel Scavenge).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Nguyen:2015:FCR, author = "Khanh Nguyen and Kai Wang and Yingyi Bu and Lu Fang and Jianfei Hu and Guoqing Xu", title = "{FACADE}: a Compiler and Runtime for (Almost) Object-Bounded Big Data Applications", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "675--690", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694345", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The past decade has witnessed the increasing demands on data-driven business intelligence that led to the proliferation of data-intensive applications. A managed object-oriented programming language such as Java is often the developer's choice for implementing such applications, due to its quick development cycle and rich community resource. While the use of such languages makes programming easier, their automated memory management comes at a cost. When the managed runtime meets Big Data, this cost is significantly magnified and becomes a scalability-prohibiting bottleneck. 
This paper presents a novel compiler framework, called Facade, that can generate highly-efficient data manipulation code by automatically transforming the data path of an existing Big Data application. The key treatment is that in the generated code, the number of runtime heap objects created for data types in each thread is (almost) statically bounded, leading to significantly reduced memory management cost and improved scalability. We have implemented Facade and used it to transform 7 common applications on 3 real-world, already well-optimized Big Data frameworks: GraphChi, Hyracks, and GPS. Our experimental results are very positive: the generated programs have (1) achieved a 3\%--48\% execution time reduction and an up to 88X GC reduction; (2) consumed up to 50\% less memory, and (3) scaled to much larger datasets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Agrawal:2015:ASD, author = "Varun Agrawal and Abhiroop Dabral and Tapti Palit and Yongming Shen and Michael Ferdman", title = "Architectural Support for Dynamic Linking", journal = j-COMP-ARCH-NEWS, volume = "43", number = "1", pages = "691--702", month = mar, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2786763.2694392", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Jun 3 11:27:38 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "All software in use today relies on libraries, including standard libraries (e.g., C, C++) and application-specific libraries (e.g., libxml, libpng). Most libraries are loaded in memory and dynamically linked when programs are launched, resolving symbol addresses across the applications and libraries. 
Dynamic linking has many benefits: It allows code to be reused between applications, conserves memory (because only one copy of a library is kept in memory for all the applications that share it), and allows libraries to be patched and updated without modifying programs, among numerous other benefits. However, these benefits come at the cost of performance. For every call made to a function in a dynamically linked library, a trampoline is used to read the function address from a lookup table and branch to the function, incurring memory load and branch operations. Static linking avoids this performance penalty, but loses all the benefits of dynamic linking. Given its myriad benefits, dynamic linking is the predominant choice today, despite the performance cost. In this work, we propose a speculative hardware mechanism to optimize dynamic linking by avoiding executing the trampolines for library function calls, providing the benefits of dynamic linking with the performance of static linking. Speculatively skipping the memory load and branch operations of the library call trampolines improves performance by reducing the number of executed instructions and gains additional performance by reducing pressure on the instruction and data caches, TLBs, and branch predictors. Because the indirect targets of library call trampolines do not change during program execution, our speculative mechanism never misspeculates in practice. We evaluate our technique on real hardware with production software and observe up to 4\% speedup using only 1.5KB of on-chip storage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'15 conference proceedings.", } @Article{Chien:2015:CSH, author = "Andrew A. 
Chien and Tung Thanh-Hoang and Dilip Vasudevan and Yuanwei Fang and Amirali Shambayati", title = "$ 10 \times 10 $: a Case Study in Highly-Programmable and Energy-Efficient Heterogeneous Federated Architecture", journal = j-COMP-ARCH-NEWS, volume = "43", number = "3", pages = "2--9", month = may, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2856113.2856115", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Dec 21 18:10:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Customized architecture is widely recognized as an important approach for improved performance and energy efficiency. To balance generality and customization benefit, researchers have proposed to federate heterogeneous micro-engines. Using the $ 10 \times 10 $ architecture and an integrated image and vision benchmark as a case study, we explore the performance and energy benefits achievable. Results for current 32nm technology and DDR3 memory show $ 10 \times 10 $ architecture benefits of 140$ \times $ performance and 72$ \times $ energy overall. Adding 3D-stacked DRAM increase benefits to 171$ \times $ (performance) and 100$ \times $ (energy). 
Finally, considering future 7nm transistor process, benefits as large as 597$ \times $ (performance) and 137$ \times $ energy are observed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2015:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "43", number = "3", pages = "10--16", month = may, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2856113.2856117", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Dec 21 18:10:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Herbordt:2015:LLG, author = "Martin Herbordt and Miriam Leeser", title = "Off-Loading {LET} Generation to {PEACH2}: a Switching Hub for High Performance {GPU} Clusters", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "3--8", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927966", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A hardware local essential tree (LET) generator used in an N-body simulation is implemented on the FPGA of PEACH2 (PCI Express Adaptive Communication Hub ver2), a low latency switching hub for high performance GPU clusters. By using the pipelined on-the-fly execution with a multipole acceptance criterion judging module and a data updating module, the generation performance is 2.2 times faster than that with the CPU. 
When data communication is considered, the performance was 7.2 times as the case with the CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Okina:2015:PPP, author = "Koji Okina and Rie Soejima and Kota Fukumoto and Yuichiro Shibata and Kiyoshi Oguri", title = "Power Performance Profiling of {$3$-D} Stencil Computation on an {FPGA} Accelerator for Efficient Pipeline Optimization", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "9--14", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927967", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper discusses power-performance optimization for 3-D stencil computing on a stream-oriented FPGA accelerator with high-level synthesis. Taking a heat conduction simulation and an FDTD electromagnetic field simulation as benchmark applications, power-performance profiling results are presented focusing on the effect of high-level pipeline parameters. As a result, it is shown that the optimal power efficiency can be achieved basically by optimizing the execution performance. 
The relationship between power efficiency and the clock frequency is also discussed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Lashgar:2015:CSR, author = "Ahmad Lashgar and Ebad Salehi and Amirali Baniasadi", title = "A Case Study in Reverse Engineering {GPGPUs}: Outstanding Memory Handling Resources", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "15--21", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927968", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "During recent years, GPU micro-architectures have changed dramatically, evolving into powerful many-core deep-multithreaded platforms for parallel workloads. While important micro-architectural modifications continue to appear in every new generation of these processors, unfortunately, little is known about the details of these innovative designs. One of the key questions in understanding GPUs is how they deal with outstanding memory misses. Our goal in this study is to find answers to this question. To this end, we develop a set of micro-benchmarks in CUDA to understand the outstanding memory requests handling resources. Particularly, we study two NVIDIA GPGPUs (Fermi and Kepler) and estimate their capability in handling outstanding memory requests. We show that Kepler can issue nearly 32X higher number of outstanding memory requests, compared to Fermi. 
We explain this enhancement by Kepler's architectural modifications in outstanding memory request handling resources.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Hayashi:2015:LRO, author = "Ami Hayashi and Yuta Tokusashi and Hiroki Matsutani", title = "A Line Rate Outlier Filtering {FPGA NIC} using {10GbE} Interface", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "22--27", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927969", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As data sets grow rapidly in size and the number, an outlier detection that filters unnecessary normal information becomes important. In this paper, we propose to move the unsupervised outlier detection from an application layer to a network interface card (NIC). Only anomalous items or events are received for a network protocol stack and the other packets are discarded at the NIC. The demands for storage and computation costs at a host are thus dramatically reduced. However, because normal items are discarded at the NIC and the application layer can no longer know what is normal, in our approach, the application at the host periodically peeks at the NIC buffer. We select an outlier detection based on the Mahalanobis distance as one of the simplest algorithms. Our approach is implemented on an FPGA-based NIC that has 10GbE interfaces. The sampling frequency of the NIC buffer vs. outlier detection precision is analyzed. 
Real experiments using the FPGA NIC demonstrate a 14,000,000 samples-per-second throughput in performance, which is close to the 10GbE line rate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Jain:2015:ADA, author = "Abhishek Kumar Jain and Xiangwei Li and Suhaib A. Fahmy and Douglas L. Maskell", title = "Adapting the {DySER} Architecture with {DSP} Blocks as an Overlay for the {Xilinx Zynq}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "28--33", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927970", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Coarse-grained overlay architectures have been shown to be effective when paired with general purpose processors, offering software-like programmability, fast compilation, and improved design productivity. These architectures enable general purpose hardware accelerators, allowing hardware design at a higher level of abstraction, but at the cost of area and performance overheads. This paper examines the DySER overlay architecture as a hardware accelerator paired with a general purpose processor in a hybrid FPGA such as the Xilinx Zynq. We evaluate the DySER architecture mapped on the Xilinx Zynq and show that it suffers from a significant area and performance overhead. We then propose an improved functional unit architecture using the flexibility of the DSP48E1 primitive which results in a 2.5 times frequency improvement and 25\% area reduction compared to the original functional unit architecture. 
We demonstrate that this improvement results in the routing architecture becoming the bottleneck in performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{delaChevallerie:2015:FLH, author = "David de la Chevallerie and Jens Korinth and Andreas Koch", title = "{ffLink}: a Lightweight High-Performance Open-Source {PCI Express Gen3} Interface for Reconfigurable Accelerators", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "34--39", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927971", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We describe the architecture and implementation of ffLink, a high-performance PCIe Gen3 interface for attaching reconfigurable accelerators on Xilinx Virtex 7 FPGA devices to Linux-based hosts. ffLink encompasses both hardware as well as flexible operating system components that allow a tailoring of the infrastructure to the specific data transfer needs of the application. When configured to use multiple DMA engines to hide transfer latencies, ffLink achieves a throughput of up to 7 GB/s, which is 95\% of the maximum throughput of an eight-lane PCIe interface, while requiring just 11\% of device area on a mid-size FPGA.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Hmid:2015:TAR, author = "Soukaina N. Hmid and Jose G. F. 
Coutinho and Wayne Luk", title = "A Transfer-Aware Runtime System for Heterogeneous Asynchronous Parallel Execution", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "40--45", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927972", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a novel resource management approach for efficiently managing the computation and the data movements between the host and its accelerators in a heterogeneous platform. Our approach is based on OmpSs, with support for multi-core CPUs, GPGPUs and Maxeler Data Flow Engines based on FPGA technology; it exploits data locality, data transfer costs and data dependencies. The proposed approach is supported by an offline learning process coupled with online monitoring, allowing performance to be estimated while learning from past observations during execution. Its performance is compared against the current OmpSs scheduler using five benchmarks: matrix multiplication, bitonic sort, N-body simulation, Cholesky decomposition and AdPredictor. The results show the proposed approach can achieve up to 4.25 times speed-up for Cholesky decomposition. 
Moreover, an evaluation with AdPredictor indicates that the FPGA version is up to 46 times faster than the CPU version for large task sizes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Al-Wattar:2015:EMA, author = "Ahmed Al-Wattar and Shawki Areibi and Gary Grewal", title = "Efficient Mapping and Allocation of Execution Units to Task Graphs using an Evolutionary Framework", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "46--51", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927973", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Partial dynamic reconfiguration of FPGAs gives designers the capability to change certain parts of the hardware while other parts remain active and in use. This provides several benefits including reducing device count and power consumption. However, this also introduces new challenges that need to be addressed by designers. This paper introduces a framework for efficient mapping of execution units to task graphs in a runtime reconfigurable system. The framework utilizes an Island Based Genetic Algorithm flow that optimizes several objectives including delay and power consumption. The GA based technique not only optimizes the above objectives, but also aggregates the Pareto front of the different islands to further enhance solution quality. The Island based GA runs each GA in parallel, and is amenable to both software and hardware implementation. 
The proposed Island based GA framework achieves on average 55.2\% improvement over a single GA implementation and 80.7\% improvement over a baseline random allocation and binding approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Momeni:2015:EEO, author = "Amir Momeni and Hamed Tabkhi and Yash Ukidave and Gunar Schirner and David Kaeli", title = "Exploring the Efficiency of the {OpenCL} Pipe Semantic on an {FPGA}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "52--57", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927974", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper evaluates the potential benefits of leveraging the OpenCL Pipe semantic to accelerate FPGA-based applications. Our work focuses on streaming applications in the embedded vision processing domain. These applications are well-suited for concurrent kernel execution support and inter-kernel communication enabled by using OpenCL pipes. We analyze the impact of multiple design factors and application optimizations to improve the performance offered by OpenCL Pipes. The design tradeoffs considered include: the execution granularity across kernels, the rate and volume of data transfers, and the Pipe size. For our case study application of vision flow, we observe a 2.8X increase in throughput for tuned pipelined kernels, as compared to non-pipelined execution.
In addition, we propose a novel mechanism to efficiently capture the behavior for 2-dimensional (2D) vision algorithms to benefit Pipe-based execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Mitsuishi:2015:BFS, author = "Takuji Mitsuishi and Jun Suzuki and Yuki Hayashi and Masaki Kan and Hideharu Amano", title = "Breadth First Search on Cost-efficient Multi-{GPU} Systems", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "58--63", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927975", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A parallel Breadth First Search (BFS) algorithm is proposed for cost-efficient multi-GPU systems without enough memory amount or communication performance. By using an improved data structure for the duplication elimination of local nodes, both required memory amount and processing time are reduced. By using Unified Virtual Addressing, time for communication can be hidden with the computation. The proposed algorithm is implemented on two cost-efficient multi-GPU systems: Express multi-GPU system which has a full of flexibility but the communication latency between GPU and host is limited, and a high-end gaming machine whose memory is limited. Both systems achieve good strong scaling with the proposed methods. 
On Express multi-GPU system, the communication overhead was almost completely hidden, and the aggregate communication throughput reached 4.77 GB/sec (38.16 Gbps), almost theoretical maximum.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Mefenza:2015:IBM, author = "Michael Mefenza and Nicolas Edwards and Christophe Bobda", title = "Interface Based Memory Synthesis of Image Processing Applications in {FPGA}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "64--69", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927976", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Image processing applications are computationally intensive and data intensive and rely on memory elements (buffer, window, line buffer, shift register, and frame buffer) to store data flow dependencies between computing components in FPGA. Due to the limited availability of these resources, optimization of memory allocation and the implementation of efficient memory architectures are important issues. We present an interface, the Component Interconnect and Data Access (CIDA), and its implementation, based on interface automata formalism. We used that interface for modeling image processing applications and generating common memory elements. Based on the proposed model and information about the FPGA architecture, we also present an optimization model to achieve allocation memory requirements to embedded memories (Block RAM and Distributed RAM). 
Allocation results from realistic video systems on Xilinx Zynq FPGAs verify the correctness of the model and show that the proposed approach achieves appreciable reduction in block RAM usage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Tong:2015:HTS, author = "Da Tong and Viktor Prasanna", title = "High Throughput Sketch Based Online Heavy Hitter Detection on {FPGA}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "70--75", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927977", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the context of networking, a heavy hitter is an entity in a data stream whose amount of activity (such as bandwidth consumption or number of connections) is higher than a given threshold. Detecting heavy hitters is a critical task for network management and security in the Internet and data centers. Data streams in modern network usually contain millions of entities, such as traffic flows or IP domains. It is challenging to detect heavy hitters at a high throughput while supporting such a large number of entities. In this work, we propose a high throughput online heavy hitter detector based on the Count-min sketch algorithm on FPGA. We propose a high throughput hash computation architecture, optimize the Count-min sketch for hardware-based heavy hitter detection and use forwarding to deal with data hazards. The post place-and-route results of our architecture on a state-of-the-art FPGA shows high throughput and scalability. Our architecture achieves a throughput of 114 Gbps while supporting a typical 1 M concurrent entities.
It sustains 100+ Gbps throughput while supporting various number of concurrent entities, stream sizes and accuracy requirements. Our implementation demonstrates improved performance compared with other sketch acceleration techniques on various platforms using similar sketch configurations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Wang:2015:CAS, author = "Xinying Wang and Phillip H. Jones and Joseph Zambreno", title = "A Configurable Architecture for Sparse {$ L U $} Decomposition on Matrices with Arbitrary Patterns", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "76--81", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927978", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Sparse LU decomposition has been widely used to solve sparse linear systems of equations found in many scientific and engineering applications, such as circuit simulation, power system modeling and computer vision. However, it is considered a computationally expensive factorization tool. While parallel implementations have been explored to accelerate sparse LU decomposition, irregular sparsity patterns often limit their performance gains. Prior FPGA-based accelerators have been customized to domain-specific sparsity patterns of pre-ordered symmetric matrices. In this paper, we present an efficient architecture for sparse LU decomposition that supports both symmetric and asymmetric sparse matrices with arbitrary sparsity patterns. The control structure of our architecture parallelizes computation and pivoting operations. Also, on-chip resource utilization is configured based on properties of the matrices being processed. 
Our experimental results show a 1.6 to 14x speedup over an optimized software implementation for benchmarks containing a wide range of sparsity patterns.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Sano:2015:SCS, author = "Kentaro Sano and Fumiya Kono and Naohito Nakasato and Alexander Vazhenin and Stanislav Sedukhin", title = "Stream Computation of Shallow Water Equation Solver for {FPGA}-based {$1$D} Tsunami Simulation", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "82--87", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927979", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "MOST (Method Of Splitting Tsunami) is widely used to solve shallow water equations (SWEs) for forecasting tsunami generated by an earthquake. Toward development of a power-efficient and high-performance computing system for 2D tsunami simulation, we conduct feasibility study on stream computation of 1D SWE solver with FPGA. We analyze an original code and design a stream algorithm with techniques of kernel fusion, shift buffering for streamed stencil-data access, and cascading processing elements for a longer pipeline. We implement a deep pipeline with at most 744 stages of 4 SPEs on 28 nm Stratix V FPGA, which achieves 82.4 GFlop/s at 200 MHz.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Guo:2015:PGA, author = "Liucheng Guo and Andreea Ingrid Funie and David B.
Thomas and Haohuan Fu and Wayne Luk", title = "Parallel Genetic Algorithms on Multiple {FPGAs}", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "86--93", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927980", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Genetic algorithms (GA) have been shown to be effective in the optimization of many large-scale real-world problems in a reasonable amount of time. Parallel GAs not only reduce the overall GA execution time, but also bring higher quality solutions due to parallel search in multiple parts of the solution space. This paper proposes a parallel GA system on hardware such as Field-Programmable-Gate-Arrays (FPGAs). Our approach targets multiple FPGAs by exploring different search areas of the same solution space with different behaviours. Each FPGA contains an optimised customisable GA which can be configured using run-time parameters, removing the need for expensive recompilation. This paper also explores adjustment of the migration gap, providing empirical guidance on good settings to users. 
Experiments on three problems show the high performance of our system, with a 30 times speedup achieved compared to a multi-core CPU-based implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Thorson:2015:INb, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "94--100", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927982", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '15 conference proceedings.", } @Article{Thorson:2015:INc, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "43", number = "5", pages = "7--11", month = dec, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2964792.2964794", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 12 16:17:49 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Asgharimoghaddam:2016:SPE, author = "Hadi Asgharimoghaddam and Nam Sung Kim", title = "{SpinWise}: a Practical Energy-Efficient Synchronization Technique for {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "1", pages = "1--8", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2971331.2971333", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 12 16:17:49 MDT 2016", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Spinning had been the classical way of implementing synchronization primitives (i.e., barriers, locks and conditions) in pthread library before the adoption of fast user space mutex (futex). Since spinning cores do not perform any useful work, it has been believed that futex is more energy efficient than spinning. In this paper, using commercial chip multi-processors (CMPs), first we provide deep insights on how the commercial CMP and operating system together reduce power consumption during spinning- and futex-based synchronization and analyze the duration of synchronization cycles for each implementation. Second, we analyze limitations of existing techniques that attempt to reduce power consumption of CMPs during synchronization. Finally, we propose a spinning-based energy-efficient synchronization technique dubbed SpinWise. We demonstrate that SpinWise can provide 22\% higher geometric mean energy efficiency than futex for a CMP running applications with many frequent and short synchronization events.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Olson:2016:PDW, author = "Lena E. Olson and Mark D. Hill", title = "Probabilistic Directed Writebacks for Exclusive Caches", journal = j-COMP-ARCH-NEWS, volume = "44", number = "1", pages = "9--18", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2971331.2971334", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 12 16:17:49 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Energy is an increasingly important consideration in memory system design. 
Caches improve energy efficiency by decreasing execution time and reducing the number of main memory accesses, but they suffer from known inefficiencies: the last-level cache (LLC) tends to have a high miss ratio while simultaneously storing many blocks that are never referenced. Because these blocks are not referenced before eviction, we can write them directly to memory rather than to the LLC. To do so, we must predict which blocks will not be referenced. Previous approaches rely on additional state at the LLC and/or extra communication. We show that by predicting working set size per program counter (PC), we can decide which blocks have low probability of being referenced. Our approach relies on the insight that it is possible to make this prediction based solely on the address stream as seen by the level-one data cache (L1D), eliminating the need to store or communicate PC values between levels of the cache hierarchy. We require no modifications to the LLC. Our approach uses Flajolet and Martin's probabilistic counting to keep the state small: two additional bits per L1D block, with an additional 6KB prediction table.
This approach yields a large reduction in number of LLC writebacks: 25\% fewer for SPEC on average, 80\% fewer for graph500, and 67\% fewer for an in-memory hash table.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Thorson:2016:INa, author = "Mark Thorson", title = "{Internet} Nuggets", journal = j-COMP-ARCH-NEWS, volume = "44", number = "1", pages = "19--22", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2971331.2971336", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 12 16:17:49 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zhou:2016:PUH, author = "Yuanyuan Zhou", title = "Programming Uncertain {$<$T$>$hings}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "1--2", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872416", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Innovation flourishes with good abstractions. For instance, codification of the IEEE Floating Point standard in 1985 was critical to the subsequent success of scientific computing. Programming languages currently lack appropriate abstractions for uncertain data. Applications already use estimates from sensors, machine learning, big data, humans, and approximate algorithms, but most programming languages do not help developers address correctness, programmability, and optimization problems due to estimates. To address these problems, we propose a new programming abstraction called Uncertain.
We encourage the community to develop and use abstractions for estimates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Abadal:2016:WAF, author = "Sergi Abadal and Albert Cabellos-Aparicio and Eduard Alarcon and Josep Torrellas", title = "{WiSync}: an Architecture for Fast Synchronization through On-Chip Wireless Communication", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "3--17", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872396", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In shared-memory multiprocessing, fine-grain synchronization is challenging because it requires frequent communication. As technology scaling delivers larger manycore chips, such pattern is expected to remain costly to support. In this paper, we propose to address this challenge by using on-chip wireless communication. Each core has a transceiver and an antenna to communicate with all the other cores. This environment supports very low latency global communication. Our architecture, called WiSync, uses a per-core Broadcast Memory (BM). When a core writes to its BM, all the other 100+ BMs get updated in less than 10 processor cycles. We also use a second wireless channel with cheaper transfers to execute barriers efficiently. WiSync supports multiprogramming, virtual memory, and context switching. Our evaluation with simulations of 128-threaded kernels and 64-threaded applications shows that WiSync speeds-up synchronization substantially. 
Compared to using advanced conventional synchronization, WiSync attains an average speedup of nearly one order of magnitude for the kernels, and 1.12 for PARSEC and SPLASH-2.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Wang:2016:RTE, author = "Xiaodong Wang and Jos{\'e} F. Mart{\'\i}nez", title = "{ReBudget}: Trading Off Efficiency vs. Fairness in Market-Based Multicore Resource Allocation via Runtime Budget Reassignment", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "19--32", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872382", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Efficiently allocating shared resources in computer systems is critical to optimizing execution. Recently, a number of market-based solutions have been proposed to attack this problem. Some of them provide provable theoretical bounds to efficiency and/or fairness losses under market equilibrium. However, they are limited to markets with potentially important constraints, such as enforcing equal budget for all players, or curve-fitting players' utility into a specific function type. Moreover, they do not generally provide an intuitive ``knob'' to control efficiency vs. fairness. In this paper, we introduce two new metrics, Market Utility Range (MUR) and Market Budget Range (MBR), through which we provide for the first time theoretical bounds on efficiency and fairness of market equilibria under arbitrary budget assignments. We leverage this result and propose ReBudget, an iterative budget re-assignment algorithm that can be used to control efficiency vs. fairness at run-time. 
We apply our algorithm to a multi-resource allocation problem in multicore chips. Our evaluation using detailed execution-driven simulations shows that our budget re-assignment technique is intuitive, effective, and efficient.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Zhu:2016:DEQ, author = "Haishan Zhu and Mattan Erez", title = "{Dirigent}: Enforcing {QoS} for Latency-Critical Tasks on Shared Multicore Systems", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "33--47", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872394", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Latency-critical applications suffer from both average performance degradation and reduced completion time predictability when collocated with batch tasks. Such variation forces the system to overprovision resources to ensure Quality of Service (QoS) for latency-critical tasks, degrading overall system throughput. We explore the causes of this variation and exploit the opportunities of mitigating variation directly to simultaneously improve both QoS and utilization. We develop, implement, and evaluate Dirigent, a lightweight performance-management runtime system that accurately controls the QoS of latency-critical applications at fine time scales, leveraging existing architecture mechanisms. 
We evaluate Dirigent on a real machine and show that it is significantly more effective than configurations representative of prior schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Kuperman:2016:PR, author = "Yossi Kuperman and Eyal Moscovici and Joel Nider and Razya Ladelsky and Abel Gordon and Dan Tsafrir", title = "Paravirtual Remote {I/O}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "49--65", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872378", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The traditional ``trap and emulate'' I/O paravirtualization model conveniently allows for I/O interposition, yet it inherently incurs costly guest-host context switches. The newer ``sidecore'' model eliminates this overhead by dedicating host (side)cores to poll the relevant guest memory regions and react accordingly without context switching. But the dedication of sidecores on each host might be wasteful when I/O activity is low, or it might not provide enough computational power when I/O activity is high. We propose to alleviate this problem at rack scale by consolidating the dedicated sidecores spread across several hosts onto one server. The hypervisor is then effectively split into two parts: the local hypervisor that hosts the VMs, and the remote hypervisor that processes their paravirtual I/O. We call this model vRIO---paraVirtual Remote I/O. We find that by increasing the latency somewhat, it provides comparable throughput with fewer sidecores and superior throughput with the same number of sidecores as compared to the state of the art. 
vRIO additionally constitutes a new, cost-effective way to consolidate I/O devices (on the remote hypervisor) while supporting efficient programmable I/O interposition.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Kaufmann:2016:HPP, author = "Antoine Kaufmann and Simon Peter and Naveen Kr. Sharma and Thomas Anderson and Arvind Krishnamurthy", title = "High Performance Packet Processing with {FlexNIC}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "67--81", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872367", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The recent surge of network I/O performance has put enormous pressure on memory and software I/O processing sub systems. We argue that the primary reason for high memory and processing overheads is the inefficient use of these resources by current commodity network interface cards (NICs). We propose FlexNIC, a flexible network DMA interface that can be used by operating systems and applications alike to reduce packet processing overheads. FlexNIC allows services to install packet processing rules into the NIC, which then executes simple operations on packets while exchanging them with host memory. Thus, our proposal moves some of the packet processing traditionally done in software to the NIC, where it can be done flexibly and at high speed. We quantify the potential benefits of FlexNIC by emulating the proposed FlexNIC functionality with existing hardware or in software.
We show that significant gains in application performance are possible, in terms of both latency and throughput, for several widely used applications, including a key-value store, a stream processing system, and an intrusion detection system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Bornholt:2016:SCF, author = "James Bornholt and Antoine Kaufmann and Jialin Li and Arvind Krishnamurthy and Emina Torlak and Xi Wang", title = "Specifying and Checking File System Crash-Consistency Models", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "83--98", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872406", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Applications depend on persistent storage to recover state after system crashes. But the POSIX file system interfaces do not define the possible outcomes of a crash. As a result, it is difficult for application writers to correctly understand the ordering of and dependencies between file system operations, which can lead to corrupt application state and, in the worst case, catastrophic data loss. This paper presents crash-consistency models, analogous to memory consistency models, which describe the behavior of a file system across crashes. Crash-consistency models include both litmus tests, which demonstrate allowed and forbidden behaviors, and axiomatic and operational specifications. We present a formal framework for developing crash-consistency models, and a toolkit, called Ferrite, for validating those models against real file system implementations. 
We develop a crash-consistency model for ext4, and use Ferrite to demonstrate unintuitive crash behaviors of the ext4 implementation. To demonstrate the utility of crash-consistency models to application writers, we use our models to prototype proof-of-concept verification and synthesis tools, as well as new library interfaces for crash-safe applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Prasad:2016:PMR, author = "Aravinda Prasad and K. Gopinath", title = "Prudent Memory Reclamation in Procrastination-Based Synchronization", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "99--112", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872405", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Procrastination is the fundamental technique used in synchronization mechanisms such as Read-Copy-Update (RCU) where writers, in order to synchronize with readers, defer the freeing of an object until there are no readers referring to the object. The synchronization mechanism determines when the deferred object is safe to reclaim and when it is actually reclaimed. Hence, such memory reclamations are completely oblivious of the memory allocator state. This induces poor memory allocator performance, for instance, when the reclamations are ill-timed. Furthermore, deferred objects provide hints about the future that inform memory regions that are about to be freed. Although useful, hints are not exploited as deferred objects are not visible to memory allocators. We introduce Prudence, a dynamic memory allocator, that is tightly integrated with the synchronization mechanism to ensure visibility of deferred objects to the memory allocator. 
Such an integration enables Prudence to (i) identify the safe time to reclaim deferred objects' memory, (ii) have an inclusive view of the allocated, free and about-to-be-freed objects, and (iii) exploit optimizations based on the hints about the future during important state transitions. Our evaluation in the Linux kernel shows that Prudence integrated with RCU performs 3.9X to 28X better in micro-benchmarks compared to SLUB, a recent memory allocator in the Linux kernel. It also improves the overall performance perceptibly (4\%--18\%) for a mix of widely used synthetic and application benchmarks. Further, it performs better (up to 98\%) in terms of object hits in caches, object cache churns, slab churns, peak memory usage and total fragmentation, when compared with the SLUB allocator.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Mukkara:2016:WID, author = "Anurag Mukkara and Nathan Beckmann and Daniel Sanchez", title = "{Whirlpool}: Improving Dynamic Cache Management with Static Data Classification", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "113--127", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872363", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cache hierarchies are increasingly non-uniform and difficult to manage. Several techniques, such as scratchpads or reuse hints, use static information about how programs access data to manage the memory hierarchy. Static techniques are effective on regular programs, but because they set fixed policies, they are vulnerable to changes in program behavior or available cache space.
Instead, most systems rely on dynamic caching policies that adapt to observed program behavior. Unfortunately, dynamic policies spend significant resources trying to learn how programs use memory, and yet they often perform worse than a static policy. We present Whirlpool, a novel approach that combines static information with dynamic policies to reap the benefits of each. Whirlpool statically classifies data into pools based on how the program uses memory. Whirlpool then uses dynamic policies to tune the cache to each pool. Hence, rather than setting policies statically, Whirlpool uses static analysis to guide dynamic policies. We present both an API that lets programmers specify pools manually and a profiling tool that discovers pools automatically in unmodified binaries. We evaluate Whirlpool on a state-of-the-art NUCA cache. Whirlpool significantly outperforms prior approaches: on sequential programs, Whirlpool improves performance by up to 38\% and reduces data movement energy by up to 53\%; on parallel programs, Whirlpool improves performance by up to 67\% and reduces data movement energy by up to 2.6x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Jeon:2016:TTD, author = "Myeongjae Jeon and Yuxiong He and Hwanju Kim and Sameh Elnikety and Scott Rixner and Alan L. 
Cox", title = "{TPC}: Target-Driven Parallelism Combining Prediction and Correction to Reduce Tail Latency in Interactive Services", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "129--141", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872370", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In interactive services such as web search, recommendations, games and finance, reducing the tail latency is crucial to provide fast response to every user. Using web search as a driving example, we systematically characterize interactive workload to identify the opportunities and challenges for reducing tail latency. We find that the workload consists of mainly short requests that do not benefit from parallelism, and a few long requests which significantly impact the tail but exhibit high parallelism speedup. This motivates estimating request execution time, using a predictor, to identify long requests and to parallelize them. Prediction, however, is not perfect; a long request mispredicted as short is likely to contribute to the server tail latency, setting a ceiling on the achievable tail latency. We propose TPC, an approach that combines prediction information judiciously with dynamic correction for inaccurate prediction. Dynamic correction increases parallelism to accelerate a long request that is mispredicted as short. TPC carefully selects the appropriate target latencies based on system load and parallelism efficiency to reduce tail latency. We implement TPC and several prior approaches to compare them experimentally on a single search server and on a cluster of 40 search servers. The experimental results show that TPC reduces the 99th- and 99.9th-percentile latency by up to 40\% compared with the best prior work. 
Moreover, we evaluate TPC on a finance server, demonstrating its effectiveness on reducing tail latency of interactive services beyond web search.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Brown:2016:HBS, author = "Fraser Brown and Andres N{\"o}tzli and Dawson Engler", title = "How to Build Static Checking Systems Using Orders of Magnitude Less Code", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "143--157", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872364", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern static bug finding tools are complex. They typically consist of hundreds of thousands of lines of code, and most of them are wedded to one language (or even one compiler). This complexity makes the systems hard to understand, hard to debug, and hard to retarget to new languages, thereby dramatically limiting their scope. This paper reduces checking system complexity by addressing a fundamental assumption, the assumption that checkers must depend on a full-blown language specification and compiler front end. Instead, our program checkers are based on drastically incomplete language grammars (``micro-grammars'') that describe only portions of a language relevant to a checker. As a result, our implementation is tiny---roughly 2500 lines of code, about two orders of magnitude smaller than a typical system. We hope that this dramatic increase in simplicity will allow people to use more checkers on more systems in more languages. We implement our approach in $ \mu $chex, a language-agnostic framework for writing static bug checkers.
We use it to build micro-grammar based checkers for six languages (C, the C preprocessor, C++, Java, JavaScript, and Dart) and find over 700 errors in real-world projects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Zhang:2016:TED, author = "Tong Zhang and Dongyoon Lee and Changhee Jung", title = "{TxRace}: Efficient Data Race Detection Using Commodity Hardware Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "159--173", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872384", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Detecting data races is important for debugging shared-memory multithreaded programs, but the high runtime overhead prevents the wide use of dynamic data race detectors. This paper presents TxRace, a new software data race detector that leverages commodity hardware transactional memory (HTM) to speed up data race detection. TxRace instruments a multithreaded program to transform synchronization-free regions into transactions, and exploits the conflict detection mechanism of HTM for lightweight data race detection at runtime. However, the limitations of the current best-effort commodity HTMs expose several challenges in using them for data race detection: (1) lack of ability to pinpoint racy instructions, (2) false positives caused by cache line granularity of conflict detection, and (3) transactional aborts for non-conflict reasons (e.g., capacity or unknown). 
To overcome these challenges, TxRace performs lightweight HTM-based data race detection at first, and occasionally switches to slow yet precise data race detection only for the small fraction of execution intervals in which potential races are reported by HTM. According to the experimental results, TxRace reduces the average runtime overhead of dynamic data race detection from 11.68x to 4.65x with only a small number of false negatives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Amani:2016:CVH, author = "Sidney Amani and Alex Hixon and Zilin Chen and Christine Rizkallah and Peter Chubb and Liam O'Connor and Joel Beeren and Yutaka Nagashima and Japheth Lim and Thomas Sewell and Joseph Tuong and Gabriele Keller and Toby Murray and Gerwin Klein and Gernot Heiser", title = "{Cogent}: Verifying High-Assurance File System Implementations", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "175--188", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872404", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present an approach to writing and formally verifying high-assurance file-system code in a restricted language called Cogent, supported by a certifying compiler that produces C code, high-level specification of Cogent, and translation correctness proofs. The language is strongly typed and guarantees absence of a number of common file system implementation errors. We show how verification effort is drastically reduced for proving higher-level properties of the file system implementation by reasoning about the generated formal specification rather than its low-level C code. 
We use the framework to write two Linux file systems, and compare their performance with their native C implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Asmussen:2016:MHO, author = "Nils Asmussen and Marcus V{\"o}lp and Benedikt N{\"o}then and Hermann H{\"a}rtig and Gerhard Fettweis", title = "{M3}: a Hardware\slash Operating-System Co-Design to Tame Heterogeneous Manycores", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "189--203", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872371", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the last decade, the number of available cores increased and heterogeneity grew. In this work, we ask the question whether the design of the current operating systems (OSes) is still appropriate if these trends continue and lead to abundantly available but heterogeneous cores, or whether it forces a fundamental rethinking of how systems are designed. We argue that: 1. hiding heterogeneity behind a common hardware interface unifies, to a large extent, the control and coordination of cores and accelerators in the OS, 2. isolating at the network-on-chip rather than with processor features (like privileged mode, memory management unit, ...), allows running untrusted code on arbitrary cores, and 3. providing OS services via protocols over the network-on-chip, instead of via system calls, makes them accessible to arbitrary types of cores as well. In summary, this turns accelerators into first-class citizens and enables a single and convenient programming environment for all cores without the need to trust any application. 
In this paper, we introduce network-on-chip-level isolation, present the design of our microkernel-based OS, M3, and the common hardware interface, and evaluate the performance of our prototype in comparison to Linux. A bit surprising, without using accelerators, M3 outperforms Linux in some application-level benchmarks by more than a factor of five.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Liaqat:2016:SEE, author = "Daniyal Liaqat and Silviu Jingoi and Eyal de Lara and Ashvin Goel and Wilson To and Kevin Lee and Italo {De Moraes Garcia} and Manuel Saldana", title = "Sidewinder: an Energy Efficient and Developer Friendly Heterogeneous Architecture for Continuous Mobile Sensing", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "205--215", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872398", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Applications that perform continuous sensing on mobile phones have the potential to revolutionize everyday life. Examples range from medical and health monitoring applications, such as pedometers and fall detectors, to participatory sensing applications, such as noise pollution, traffic and seismic activity monitoring. Unfortunately, current mobile devices are a poor match for continuous sensing applications as they require the device to remain awake for extended periods of time, resulting in poor battery life. This paper presents Sidewinder, a new approach towards offloading sensor data processing to a low-power processor and waking up the main processor when events of interest occur. 
This approach differs from other heterogeneous architectures in that developers are presented with a programming interface that lets them construct application specific wake-up conditions by linking together and parameterizing predefined sensor data processing algorithms. Our experiments indicate performance that is comparable to approaches that provide fully programmable offloading, but do so with a much simpler programming interface that facilitates deployment and portability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Balkind:2016:OOS, author = "Jonathan Balkind and Michael McKeown and Yaosheng Fu and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and Mohammad Shahrad and Adi Fuchs and Samuel Payne and Xiaohua Liang and Matthew Matl and David Wentzlaff", title = "{OpenPiton}: an Open Source Manycore Research Framework", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "217--232", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872414", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Industry is building larger, more complex, manycore processors on the back of strong institutional knowledge, but academic projects face difficulties in replicating that scale. To alleviate these difficulties and to develop and share knowledge, the community needs open architecture frameworks for simulation, synthesis, and software exploration which support extensibility, scalability, and configurability, alongside an established base of verification tools and supported software. In this paper we present OpenPiton, an open source framework for building scalable architecture research prototypes from 1 core to 500 million cores. 
OpenPiton is the world's first open source, general-purpose, multithreaded manycore processor and framework. OpenPiton leverages the industry hardened OpenSPARC T1 core with modifications and builds upon it with a scratch-built, scalable uncore creating a flexible, modern manycore design. In addition, OpenPiton provides synthesis and backend scripts for ASIC and FPGA to enable other researchers to bring their designs to implementation. OpenPiton provides a complete verification infrastructure of over 8000 tests, is supported by mature software tools, runs full-stack multiuser Debian Linux, and is written in industry standard Verilog. Multiple implementations of OpenPiton have been created including a taped-out 25-core implementation in IBM's 32nm process and multiple Xilinx FPGA prototypes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Lustig:2016:CVM, author = "Daniel Lustig and Geet Sethi and Margaret Martonosi and Abhishek Bhattacharjee", title = "{COATCheck}: Verifying Memory Ordering at the Hardware-OS Interface", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "233--247", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872399", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern computer systems include numerous compute elements, from CPUs to GPUs to accelerators. Harnessing their full potential requires well-defined, properly-implemented memory consistency models (MCMs), and low-level system functionality such as virtual memory and address translation (AT). 
Unfortunately, it is difficult to specify and implement hardware-OS interactions correctly; in the past, many hardware and OS specification mismatches have resulted in implementation bugs in commercial processors. In an effort to resolve this verification gap, this paper makes the following contributions. First, we present COATCheck, an address translation-aware framework for specifying and statically verifying memory ordering enforcement at the microarchitecture and operating system levels. We develop a domain-specific language for specifying ordering enforcement, for including ordering-related OS events and hardware micro-operations, and for programmatically enumerating happens-before graphs. Using a fast and automated static constraint solver, COATCheck can efficiently analyze interesting and important memory ordering scenarios for modern, high-performance, out-of-order processors. Second, we show that previous work on Virtual Address Memory Consistency (VAMC) does not capture every translation-related ordering scenario of interest, and that some such cases even fall outside the traditional scope of consistency. 
We therefore introduce the term transistency model to describe the superset of consistency which captures all translation-aware sets of ordering rules.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Markuze:2016:TIP, author = "Alex Markuze and Adam Morrison and Dan Tsafrir", title = "True {IOMMU} Protection from {DMA} Attacks: When Copy is Faster than Zero Copy", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "249--262", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872379", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Malicious I/O devices might compromise the OS using DMAs. The OS therefore utilizes the IOMMU to map and unmap every target buffer right before and after its DMA is processed, thereby restricting DMAs to their designated locations. This usage model, however, is not truly secure for two reasons: (1) it provides protection at page granularity only, whereas DMA buffers can reside on the same page as other data; and (2) it delays DMA buffer unmaps due to performance considerations, creating a vulnerability window in which devices can access in-use memory. We propose that OSes utilize the IOMMU differently, in a manner that eliminates these two flaws. Our new usage model restricts device access to a set of shadow DMA buffers that are never unmapped, and it copies DMAed data to/from these buffers, thus providing sub-page protection while eliminating the aforementioned vulnerability window. Our key insight is that the cost of interacting with, and synchronizing access to the slow IOMMU hardware---required for zero-copy protection against devices---make copying preferable to zero-copying. 
We implement our model in Linux and evaluate it with standard networking benchmarks utilizing a 40 Gb/s NIC. We demonstrate that despite being more secure than the safest preexisting usage model, our approach provides up to 5x higher throughput. Additionally, whereas it is inherently less scalable than an IOMMU-less (unprotected) system, our approach incurs only 0\%--25\% performance degradation in comparison.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Awad:2016:SSZ, author = "Amro Awad and Pratyusa Manadhata and Stuart Haber and Yan Solihin and William Horne", title = "{Silent Shredder}: Zero-Cost Shredding for Secure Non-Volatile Main Memory Controllers", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "263--276", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872377", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As non-volatile memory (NVM) technologies are expected to replace DRAM in the near future, new challenges have emerged. For example, NVMs have slow and power-consuming writes, and limited write endurance. In addition, NVMs have a data remanence vulnerability, i.e., they retain data for a long time after being powered off. NVM encryption alleviates the vulnerability, but exacerbates the limited endurance by increasing the number of writes to memory. We observe that, in current systems, a large percentage of main memory writes result from data shredding in operating systems, a process of zeroing out physical pages before mapping them to new processes, in order to protect previous processes' data.
In this paper, we propose Silent Shredder, which repurposes initialization vectors used in standard counter mode encryption to completely eliminate the data shredding writes. Silent Shredder also speeds up reading shredded cache lines, and hence reduces power consumption and improves overall performance. To evaluate our design, we run three PowerGraph applications and 26 multi-programmed workloads from the SPEC 2006 suite, on a gem5-based full system simulator. Silent Shredder eliminates an average of 48.6\% of the writes in the initialization and graph construction phases. It speeds up main memory reads by 3.3 times, and improves the number of instructions per cycle (IPC) by 6.4\% on average. Finally, we discuss several use cases, including virtual machines' data isolation and user-level large data initialization, where Silent Shredder can be used effectively at no extra cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Kwon:2016:SPT, author = "Youngjin Kwon and Alan M. Dunn and Michael Z. Lee and Owen S. Hofmann and Yuanzhong Xu and Emmett Witchel", title = "{Sego}: Pervasive Trusted Metadata for Efficiently Verified Untrusted System Services", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "277--290", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872372", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Sego is a hypervisor-based system that gives strong privacy and integrity guarantees to trusted applications, even when the guest operating system is compromised or hostile. Sego verifies operating system services, like the file system, instead of replacing them. 
By associating trusted metadata with user data across all system devices, Sego verifies system services more efficiently than previous systems, especially services that depend on data contents. We extensively evaluate Sego's performance on real workloads and implement a kernel fault injector to validate Sego's file system-agnostic crash consistency and recovery protocol.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Tsafrir:2016:SAW, author = "Dan Tsafrir", title = "Synopsis of the {ASPLOS '16 Wild and Crazy Ideas (WACI)} Invited-Speakers Session", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "291--294", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2876512", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The Wild and Crazy Ideas (WACI) session is a longstanding tradition at ASPLOS, soliciting talks that consist of forward-looking, visionary, inspiring, creative, far out or just plain amazing ideas presented in an exciting way. (Amusing elements in the presentations are tolerated ;-) but are in fact optional.) The first WACI session took place in 1998. Back then, the call for talks included a problem statement, which contended that ``papers usually do not get admitted to [such conferences as] ISCA or ASPLOS unless the systems that they describe are mature enough to run [some standard benchmark suites, which] has a chilling effect on the idea generation process --- encouraging incremental research'' [1]. The 1998 WACI session turned out to be a great success. 
Its webpage states that ``there were 42 submissions [competing over] only eight time slots, [which resulted in] this session [having] a lower acceptance rate than the conference itself'' [2]. But the times they are a-changin' [3], and the WACI session no longer enjoys that many submissions (Figure (1)), perhaps because nowadays there exist many forums for researchers to describe/discuss their preliminary ideas, including: the ``hot topics in'' workshops [4--7]; a journal like CAL, dedicated to early results [8]; main conferences soliciting short submissions describing ``original or unconventional ideas at a preliminary stage'' in addition to regular papers [9]; and the many workshops co-located with main conferences, like ISCA '15, which hosted thirteen such workshops [10]. Regardless of the reason for the declining number of submissions, this time we've decided to organize the WACI session differently to ensure its continued high quality. Instead of soliciting talks via an open call and hoping for the best, we proactively invited speakers whom we believe are capable of delivering excellent WACI presentations. That is, this year's WACI session consists exclusively of invited speakers. Filling up the available slots turned out to be fairly easy, as most of the researchers we invited promptly accepted our invitation. The duration of each talk was set to be eight minutes (exactly as in the first WACI session from 1998) plus two minutes for questions. The talks are outlined below. We believe they are interesting and exciting, and we hope the attendees of the session will find them stimulating and insightful.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Williams:2016:BIC, author = "R. 
Stanley Williams", title = "Brain Inspired Computing", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "295--295", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872417", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Phothilimthana:2016:SS, author = "Phitchaya Mangpo Phothilimthana and Aditya Thakur and Rastislav Bodik and Dinakar Dhurjati", title = "Scaling up Superoptimization", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "297--310", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872387", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Developing a code optimizer is challenging, especially for new, idiosyncratic ISAs. Superoptimization can, in principle, discover machine-specific optimizations automatically by searching the space of all instruction sequences. If we can increase the size of code fragments a superoptimizer can optimize, we will be able to discover more optimizations. We develop LENS, a search algorithm that increases the size of code a superoptimizer can synthesize by rapidly pruning away invalid candidate programs. Pruning is achieved by selectively refining the abstraction under which candidates are considered equivalent, only in the promising part of the candidate space. LENS also uses a bidirectional search strategy to prune the candidate space from both forward and backward directions. 
These pruning strategies allow LENS to solve twice as many benchmarks as existing enumerative search algorithms, while LENS is about 11-times faster. Additionally, we increase the effective size of the superoptimized fragments by relaxing the correctness condition using contexts (surrounding code). Finally, we combine LENS with complementary search techniques into a cooperative superoptimizer, which exploits the stochastic search to make random jumps in a large candidate space, and a symbolic (SAT-solver-based) search to synthesize arbitrary constants. While existing superoptimizers consistently solve 9--16 out of 32 benchmarks, the cooperative superoptimizer solves 29 benchmarks. It can synthesize code fragments that are up to 82\% faster than code generated by gcc -O3 from WiBench and MiBench.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Hasabnis:2016:LAI, author = "Niranjan Hasabnis and R. Sekar", title = "Lifting Assembly to Intermediate Representation: a Novel Approach Leveraging Compilers", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "311--324", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872380", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Translating low-level machine instructions into higher-level intermediate language (IL) is one of the central steps in many binary analysis and instrumentation systems. Existing systems build such translators manually. As a result, it takes a great deal of effort to support new architectures. Even for widely deployed architectures, full instruction sets may not be modeled, e.g., mature systems such as Valgrind still lack support for AVX, FMA4 and SSE4.1 for x86 processors. 
To overcome these difficulties, we propose a novel approach that leverages knowledge about instruction set semantics that is already embedded into modern compilers such as GCC. In particular, we present a learning-based approach for automating the translation of assembly instructions to a compiler's architecture-neutral IL. We present an experimental evaluation that demonstrates the ability of our approach to easily support many architectures (x86, ARM and AVR), including their advanced instruction sets. Our implementation is available as open-source software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Muralidharan:2016:AAC, author = "Saurav Muralidharan and Amit Roy and Mary Hall and Michael Garland and Piyush Rai", title = "Architecture-Adaptive Code Variant Tuning", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "325--338", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872411", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Code variants represent alternative implementations of a computation, and are common in high-performance libraries and applications to facilitate selecting the most appropriate implementation for a specific execution context (target architecture and input dataset). Automating code variant selection typically relies on machine learning to construct a model during an offline learning phase that can be quickly queried at runtime once the execution context is known. 
In this paper, we define a new approach called architecture-adaptive code variant tuning, where the variant selection model is learned on a set of source architectures, and then used to predict variants on a new target architecture without having to repeat the training process. We pose this as a multi-task learning problem, where each source architecture corresponds to a task; we use device features in the construction of the variant selection model. This work explores the effectiveness of multi-task learning and the impact of different strategies for device feature selection. We evaluate our approach on a set of benchmarks and a collection of six NVIDIA GPU architectures from three distinct generations. We achieve performance results that are mostly comparable to the previous approach of tuning for a single GPU architecture without having to repeat the learning phase.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Lin:2016:SKT, author = "Xiaofeng Lin and Yu Chen and Xiaodong Li and Junjie Mao and Jiaquan He and Wei Xu and Yuanchun Shi", title = "Scalable Kernel {TCP} Design and Implementation for Short-Lived Connections", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "339--352", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872391", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the rapid growth of network bandwidth, increases in CPU cores on a single machine, and application API models demanding more short-lived connections, a scalable TCP stack is performance-critical. 
Although many clean-slate designs have been proposed, production environments still call for a bottom-up parallel TCP stack design that is backward-compatible with existing applications. We present Fastsocket, a BSD Socket-compatible and scalable kernel socket design, which achieves table-level connection partition in TCP stack and guarantees connection locality for both passive and active connections. Fastsocket architecture is a ground up partition design, from NIC interrupts all the way up to applications, which naturally eliminates various lock contentions in the entire stack. Moreover, Fastsocket maintains the full functionality of the kernel TCP stack and BSD-socket-compatible API, and thus applications need no modifications. Our evaluations show that Fastsocket achieves a speedup of 20.4x on a 24-core machine under a workload of short-lived connections, outperforming the state-of-the-art Linux kernel TCP implementations. When scaling up to 24 CPU cores, Fastsocket increases the throughput of Nginx and HAProxy by 267\% and 621\% respectively compared with the base Linux kernel. We also demonstrate that Fastsocket can achieve scalability and preserve BSD socket API at the same time. 
Fastsocket is already deployed in the production environment of Sina WeiBo, serving 50 million daily active users and billions of requests per day.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Hajj:2016:SPM, author = "Izzat {El Hajj} and Alexander Merritt and Gerd Zellweger and Dejan Milojicic and Reto Achermann and Paolo Faraboschi and Wen-mei Hwu and Timothy Roscoe and Karsten Schwan", title = "{SpaceJMP}: Programming with Multiple Virtual Address Spaces", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "353--368", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872366", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory-centric computing demands careful organization of the virtual address space, but traditional methods for doing so are inflexible and inefficient. If an application wishes to address larger physical memory than virtual address bits allow, if it wishes to maintain pointer-based data structures beyond process lifetimes, or if it wishes to share large amounts of memory across simultaneously executing processes, legacy interfaces for managing the address space are cumbersome and often incur excessive overheads. We propose a new operating system design that promotes virtual address spaces to first-class citizens, enabling process threads to attach to, detach from, and switch between multiple virtual address spaces. Our work enables data-centric applications to utilize vast physical memory beyond the virtual range, represent persistent pointer-rich data structures without special pointer representations, and share large amounts of memory between processes efficiently. 
We describe our prototype implementations in the DragonFly BSD and Barrelfish operating systems. We also present programming semantics and a compiler transformation to detect unsafe pointer usage. We demonstrate the benefits of our work on data-intensive applications such as the GUPS benchmark, the SAMTools genomics workflow, and the Redis key-value store.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Lin:2016:MTP, author = "Felix Xiaozhu Lin and Xu Liu", title = "{{\tt memif}}: Towards Programming Heterogeneous Memory Asynchronously", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "369--383", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872401", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To harness a heterogeneous memory hierarchy, it is advantageous to integrate application knowledge in guiding frequent memory move, i.e., replicating or migrating virtual memory regions. To this end, we present memif, a protected OS service for asynchronous, hardware-accelerated memory move. Compared to the state of the art --- page migration in Linux, memif incurs low overhead and low latency; in order to do so, it not only redefines the semantics of kernel interface but also overhauls the underlying mechanisms, including request/completion management, race handling, and DMA engine configuration. We implement memif in Linux for a server-class system-on-chip that features heterogeneous memories. Compared to the current Linux page migration, memif reduces CPU usage by up to 15\% for small pages and by up to 38x for large pages; in continuously serving requests, memif has no need for request batching and reduces latency by up to 63\%. 
By crafting a small runtime atop memif, we improve the throughputs for a set of streaming workloads by up to 33\%. Overall, memif has opened the door to software management of heterogeneous memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Kim:2016:NEN, author = "Wook-Hee Kim and Jinwoong Kim and Woongki Baek and Beomseok Nam and Youjip Won", title = "{NVWAL}: Exploiting {NVRAM} in Write-Ahead Logging", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "385--398", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872392", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging byte-addressable non-volatile memory is considered an alternative storage device for database logs that require persistency and high performance. In this work, we develop NVWAL (NVRAM Write-Ahead Logging) for SQLite. The contribution of NVWAL consists of three elements: (i) byte-granularity differential logging that effectively eliminates the excessive I/O overhead of filesystem-based logging or journaling, (ii) transaction-aware lazy synchronization that reduces cache synchronization overhead by two-thirds, and (iii) user-level heap management of the NVRAM persistent WAL structure, which reduces the overhead of managing persistent objects. We implemented NVWAL in SQLite and measured the performance on a Nexus 5 smartphone and an NVRAM emulation board --- Tuna. Our performance study shows the following: (i) the overhead of enforcing strict ordering of NVRAM writes can be reduced via NVRAM-aware transaction management. 
(ii) From the application performance point of view, the overhead of guaranteeing failure atomicity is negligible; the cache line flush overhead accounts for only 0.8--4.6\% of transaction execution time. Therefore, application performance is much less sensitive to the NVRAM performance than we expected. Decreasing the NVRAM latency by one-fifth (from 1942 nsec to 437 nsec), SQLite achieves a mere 4\% performance gain (from 2517 ins/sec to 2621 ins/sec). (iii) Overall, when the write latency of NVRAM is 2 usec, NVWAL increases SQLite performance by at least 10x compared to that of WAL on flash memory (from 541 ins/sec to 5812 ins/sec).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Kolli:2016:HPT, author = "Aasheesh Kolli and Steven Pelley and Ali Saidi and Peter M. Chen and Thomas F. Wenisch", title = "High-Performance Transactions for Persistent Memories", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "399--411", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872381", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging non-volatile memory (NVRAM) technologies offer the durability of disk with the byte-addressability of DRAM. These devices will allow software to access persistent data structures directly in NVRAM using processor loads and stores, however, ensuring consistency of persistent data across power failures and crashes is difficult. Atomic, durable transactions are a widely used abstraction to enforce such consistency. Implementing transactions on NVRAM requires the ability to constrain the order of NVRAM writes, for example, to ensure that a transaction's log record is complete before it is marked committed. 
Since NVRAM write latencies are expected to be high, minimizing these ordering constraints is critical for achieving high performance. Recent work has proposed programming interfaces to express NVRAM write ordering constraints to hardware so that NVRAM writes may be coalesced and reordered while preserving necessary constraints. Unfortunately, a straightforward implementation of transactions under these interfaces imposes unnecessary constraints. We show how to remove these dependencies through a variety of techniques, notably, deferring commit until after locks are released. We present a comprehensive analysis contrasting two transaction designs across three NVRAM programming interfaces, demonstrating up to 2.5x speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Guo:2016:HDI, author = "Qing Guo and Karin Strauss and Luis Ceze and Henrique S. Malvar", title = "High-Density Image Storage Using Approximate Memory Cells", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "413--426", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872413", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes tailoring image encoding for an approximate storage substrate. We demonstrate that indiscriminately storing encoded images in approximate memory generates unacceptable and uncontrollable quality degradation. The key finding is that errors in the encoded bit streams have non-uniform impact on the decoded image quality. We develop a methodology to determine the relative importance of encoded bits and store them in an approximate storage substrate. 
The storage cells are optimized to reduce error rate via biasing and are tuned to meet the desired reliability requirement via selective error correction. In a case study with the progressive transform codec (PTC), a precursor to JPEG XR, the proposed approximate image storage system exhibits a 2.7x increase in density of pixels per silicon volume under bounded error rates, and this achievement is additive to the storage savings of PTC compression.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Izraelevitz:2016:FAP, author = "Joseph Izraelevitz and Terence Kelly and Aasheesh Kolli", title = "Failure-Atomic Persistent Memory Updates via {JUSTDO} Logging", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "427--442", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872410", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Persistent memory invites applications to manipulate persistent data via load and store instructions. Because failures during updates may destroy transient data (e.g., in CPU registers), preserving data integrity in the presence of failures requires failure-atomic bundles of updates. Prior failure atomicity approaches for persistent memory entail overheads due to logging and CPU cache flushing. Persistent caches can eliminate the need for flushing, but conventional logging remains complex and memory intensive. We present the design and implementation of JUSTDO logging, a new failure atomicity mechanism that greatly reduces the memory footprint of logs, simplifies log management, and enables fast parallel recovery following failure. 
Crash-injection tests confirm that JUSTDO logging preserves application data integrity and performance evaluations show that it improves throughput 3x or more compared with a state-of-the-art alternative for a spectrum of data-intensive algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Han:2016:IMD, author = "Jaeung Han and Seungheun Jeon and Young-ri Choi and Jaehyuk Huh", title = "Interference Management for Distributed Parallel Applications in Consolidated Clusters", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "443--456", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872388", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Consolidating multiple applications on a system can improve the overall resource utilization of data center systems. However, such consolidation can adversely affect the performance of some applications due to interference caused by resource contention. Despite many prior studies on the interference effects in single-node systems, the interference behaviors of distributed parallel applications have not been investigated thoroughly. With distributed applications, a local interference in a node can affect the whole execution of an application spanning many nodes. This paper studies an interference modeling methodology for distributed applications to predict their performance under interference effects in consolidated clusters. This study first characterizes the effects of interference for various distributed applications over different interference settings, and analyzes how diverse interference intensities on multiple nodes affect the overall performance. 
Based on the characterization, this study proposes a static profiling-based model for interference propagation and heterogeneity behaviors. In addition, this paper presents use case studies of the modeling method, two interference-aware placement techniques for consolidated virtual clusters, which attempt to maximize the overall throughput or to guarantee the quality-of-service.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Maas:2016:THL, author = "Martin Maas and Krste Asanovi{\'c} and Tim Harris and John Kubiatowicz", title = "{Taurus}: a Holistic Language Runtime System for Coordinating Distributed Managed-Language Applications", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "457--471", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872386", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many distributed workloads in today's data centers are written in managed languages such as Java or Ruby. Examples include big data frameworks such as Hadoop, data stores such as Cassandra or applications such as the SOLR search engine. These workloads typically run across many independent language runtime systems on different nodes. This setup represents a source of inefficiency, as these language runtime systems are unaware of each other. For example, they may perform Garbage Collection at times that are locally reasonable but not in a distributed setting. We address these problems by introducing the concept of a Holistic Runtime System that makes runtime-level decisions for the entire distributed application rather than locally. We then present Taurus, a Holistic Runtime System prototype. 
Taurus is a JVM drop-in replacement, requires almost no configuration and can run unmodified off-the-shelf Java applications. Taurus enforces user-defined coordination policies and provides a DSL for writing these policies. By applying Taurus to Garbage Collection, we demonstrate the potential of such a system and use it to explore coordination strategies for the runtime systems of real-world distributed applications, to improve application performance and address tail-latencies in latency-sensitive workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Delimitrou:2016:HRE, author = "Christina Delimitrou and Christos Kozyrakis", title = "{HCloud}: Resource-Efficient Provisioning in Shared Cloud Systems", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "473--488", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872365", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cloud computing promises flexibility and high performance for users and cost efficiency for operators. To achieve this, cloud providers offer instances of different sizes, both as long-term reservations and short-term, on-demand allocations. Unfortunately, determining the best provisioning strategy is a complex, multi-dimensional problem that depends on the load fluctuation and duration of incoming jobs, and the performance unpredictability and cost of resources. We first compare the two main provisioning strategies (reserved and on-demand resources) on Google Compute Engine (GCE) using three representative workload scenarios with batch and latency-critical applications. We show that either approach is suboptimal for performance or cost. 
We then present HCloud, a hybrid provisioning system that uses both reserved and on-demand resources. HCloud determines which jobs should be mapped to reserved versus on-demand resources based on overall load, and resource unpredictability. It also determines the optimal instance size an application needs to satisfy its Quality of Service (QoS) constraints. We demonstrate that hybrid configurations improve performance by 2.1x compared to fully on-demand provisioning, and reduce cost by 46\% compared to fully reserved systems. We also show that hybrid strategies are robust to variation in system and job parameters, such as cost and system load.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Yu:2016:CWM, author = "Xiao Yu and Pallavi Joshi and Jianwu Xu and Guoliang Jin and Hui Zhang and Guofei Jiang", title = "{CloudSeer}: Workflow Monitoring of Cloud Infrastructures via Interleaved Logs", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "489--502", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872407", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cloud infrastructures provide a rich set of management tasks that operate computing, storage, and networking resources in the cloud. Monitoring the executions of these tasks is crucial for cloud providers to promptly find and understand problems that compromise cloud availability. However, such monitoring is challenging because there are multiple distributed service components involved in the executions. CloudSeer enables effective workflow monitoring. It takes a lightweight non-intrusive approach that purely works on interleaved logs widely existing in cloud infrastructures. 
CloudSeer first builds an automaton for the workflow of each management task based on normal executions, and then it checks log messages against a set of automata for workflow divergences in a streaming manner. Divergences found during the checking process indicate potential execution problems, which may or may not be accompanied by error log messages. For each potential problem, CloudSeer outputs necessary context information including the affected task automaton and related log messages hinting where the problem occurs to help further diagnosis. Our experiments on OpenStack, a popular open-source cloud infrastructure, show that CloudSeer's efficiency and problem-detection capability are suitable for online monitoring.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Kwon:2016:LCI, author = "Yonghwi Kwon and Dohyeong Kim and William Nick Sumner and Kyungtae Kim and Brendan Saltaformaggio and Xiangyu Zhang and Dongyan Xu", title = "{LDX}: Causality Inference by Lightweight Dual Execution", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "503--515", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872395", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Causality inference, such as dynamic taint analysis, has many applications (e.g., information leak detection). It determines whether an event e is causally dependent on a preceding event c during execution. We develop a new causality inference engine LDX. Given an execution, it spawns a slave execution, in which it mutates c and observes whether any change is induced at e. To preclude non-determinism, LDX couples the executions by sharing syscall outcomes. 
To handle path differences induced by the perturbation, we develop a novel on-the-fly execution alignment scheme that maintains a counter to reflect the progress of execution. The scheme relies on program analysis and compiler transformation. LDX can effectively detect information leak and security attacks with an average overhead of 6.08\% while running the master and the slave concurrently on separate CPUs, much lower than existing systems that require instruction level monitoring. Furthermore, it has much better accuracy in causality inference.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Leesatapornwongsa:2016:TTN, author = "Tanakorn Leesatapornwongsa and Jeffrey F. Lukman and Shan Lu and Haryadi S. Gunawi", title = "{TaxDC}: a Taxonomy of Non-Deterministic Concurrency Bugs in Datacenter Distributed Systems", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "517--530", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872374", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present TaxDC, the largest and most comprehensive taxonomy of non-deterministic concurrency bugs in distributed systems. We study 104 distributed concurrency (DC) bugs from four widely-deployed cloud-scale datacenter distributed systems, Cassandra, Hadoop MapReduce, HBase and ZooKeeper. We study DC-bug characteristics along several axes of analysis such as the triggering timing condition and input preconditions, error and failure symptoms, and fix strategies, collectively stored as 2,083 classification labels in TaxDC database. 
We discuss how our study can open up many new research directions in combating DC bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Mao:2016:RFR, author = "Junjie Mao and Yu Chen and Qixue Xiao and Yuanchun Shi", title = "{RID}: Finding Reference Count Bugs with Inconsistent Path Pair Checking", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "531--544", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872389", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Reference counts are widely used in OS kernels for resource management. However, reference counts are not trivial to be used correctly in large scale programs because it is left to developers to make sure that an increment to a reference count is always paired with a decrement. This paper proposes inconsistent path pair checking, a novel technique that can statically discover bugs related to reference counts without knowing how reference counts should be changed in a function. A prototype called RID is implemented and evaluations show that RID can discover more than 80 bugs which were confirmed by the developers in the latest Linux kernel. 
The results also show that RID tends to reveal bugs caused by developers' misunderstanding on API specifications or error conditions that are not handled properly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Zhang:2016:MPU, author = "Huazhe Zhang and Henry Hoffmann", title = "Maximizing Performance Under a Power Cap: a Comparison of Hardware, Software, and Hybrid Techniques", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "545--559", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872375", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Power and thermal dissipation constrain multicore performance scaling. Modern processors are built such that they could sustain damaging levels of power dissipation, creating a need for systems that can implement processor power caps. A particular challenge is developing systems that can maximize performance within a power cap, and approaches have been proposed in both software and hardware. Software approaches are flexible, allowing multiple hardware resources to be coordinated for maximum performance, but software is slow, requiring a long time to converge to the power target. In contrast, hardware power capping quickly converges to the power cap, but only manages voltage and frequency, limiting its potential performance. In this work we propose PUPiL, a hybrid software/hardware power capping system. Unlike previous approaches, PUPiL combines hardware's fast reaction time with software's flexibility. We implement PUPiL on real Linux/x86 platform and compare it to Intel's commercial hardware power capping system for both single and multi-application workloads. 
We find PUPiL provides the same reaction time as Intel's hardware with significantly higher performance. On average, PUPiL outperforms hardware by 1.18--2.4x depending on workload and power target. Thus, PUPiL provides a promising way to enforce power caps with greater performance than current state-of-the-art hardware-only approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Fan:2016:CSG, author = "Songchun Fan and Seyed Majid Zahedi and Benjamin C. Lee", title = "The Computational Sprinting Game", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "561--575", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872383", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Computational sprinting is a class of mechanisms that boost performance but dissipate additional power. We describe a sprinting architecture in which many, independent chip multiprocessors share a power supply and sprints are constrained by the chips' thermal limits and the rack's power limits. Moreover, we present the computational sprinting game, a multi-agent perspective on managing sprints. Strategic agents decide whether to sprint based on application phases and system conditions. 
The game produces an equilibrium that improves task throughput for data analytics workloads by 4--6$ \times $ over prior greedy heuristics and performs within 90\% of an upper bound on throughput from a globally optimized policy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Colin:2016:EIF, author = "Alexei Colin and Graham Harvey and Brandon Lucia and Alanson P. Sample", title = "An Energy-interference-free Hardware-Software Debugger for Intermittent Energy-harvesting Systems", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "577--589", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872409", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Energy-autonomous computing devices have the potential to extend the reach of computing to a scale beyond either wired or battery-powered systems. However, these devices pose a unique set of challenges to application developers who lack both hardware and software support tools. Energy harvesting devices experience power intermittence which causes the system to reset and power-cycle unpredictably, tens to hundreds of times per second. This can result in code execution errors that are not possible in continuously-powered systems and cannot be diagnosed with conventional debugging tools such as JTAG and/or oscilloscopes. We propose the Energy-interference-free Debugger, a hardware and software platform for monitoring and debugging intermittent systems without adversely affecting their energy state. The Energy-interference-free Debugger re-creates a familiar debugging environment for intermittent software and augments it with debugging primitives for effective diagnosis of intermittence bugs. 
Our evaluation of the Energy-interference-free Debugger quantifies its energy-interference-freedom and shows its value in a set of debugging tasks in complex test programs and several real applications, including RFID code and a machine-learning-based activity recognition system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Witchel:2016:PPW, author = "Emmett Witchel", title = "Programmer Productivity in a World of Mushy Interfaces: Challenges of the Post-{ISA} Reality", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "591--591", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2876511", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Since 1964, we had the notion that the instruction set architecture (ISA) is a useful and fairly opaque abstraction layer between hardware and software. Software rode hardware's performance wave while remaining gloriously oblivious to hardware's growing complexity. Unfortunately, the jig is up. We still have ISAs, but the abstraction no longer offers seamless portability---parallel software needs to be tuned for different core counts, and heterogeneous processing elements (CPUs, GPUs, accelerators) further complicate programmability. We are better at building large-scale heterogeneous processors than we are at programming them. Maintaining software across multiple current platforms is difficult and porting to future platforms is also difficult. There have been many technical responses: virtual ISAs (e.g., NVIDIA's PTX), higher-level programming interfaces (e.g., CUDA or OpenCL), and late-stage compilation and platform-specific tailoring (e.g., Android ART), etc. 
A team of opinionated experts, drawn from the three ASPLOS communities will examine the problem of programmer productivity in the post-ISA world, first from the perspective of their area of expertise and then noting the contributions from the other two communities. What research will save us and how? This wide-ranging debate will frame important research areas for future work while being grounded in frank discussion about what has succeeded in the past. Attendees can expect actionable insight into important research issues as well as an entertaining discussion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Angstadt:2016:RPP, author = "Kevin Angstadt and Westley Weimer and Kevin Skadron", title = "{RAPID} Programming of Pattern-Recognition Processors", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "593--605", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872393", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present RAPID, a high-level programming language and combined imperative and declarative model for programming pattern-recognition processors, such as Micron's Automata Processor (AP). The AP is a novel, non-Von Neumann architecture for direct execution of non-deterministic finite automata (NFAs), and has been demonstrated to provide substantial speedup for a variety of data-processing applications. RAPID is clear, maintainable, concise, and efficient both at compile and run time. Language features, such as code abstraction and parallel control structures, map well to pattern-matching problems, providing clarity and maintainability.
For generation of efficient runtime code, we present algorithms to convert RAPID programs into finite automata. Further, we introduce a tessellation technique for configuring the AP, which significantly reduces compile time, increases programmer productivity, and improves maintainability. We evaluate five RAPID programs against custom, baseline implementations previously demonstrated to be significantly accelerated by the AP. We find that RAPID programs are much shorter in length, are expressible at a higher level of abstraction than their handcrafted counterparts, and yield generated code that is often more compact. In addition, our tessellation technique for configuring the AP has comparable device utilization to, and results in compilation that is up to four orders of magnitude faster than, current solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Sui:2016:PCA, author = "Xin Sui and Andrew Lenharth and Donald S. Fussell and Keshav Pingali", title = "Proactive Control of Approximate Programs", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "607--621", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872402", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Approximate computing trades off accuracy of results for resources such as energy or computing time. There is a large and rapidly growing literature on approximate computing that has focused mostly on showing the benefits of approximate computing. However, we know relatively little about how to control approximation in a disciplined way. 
In this paper, we address the problem of controlling approximation for non-streaming programs that have a set of ``knobs'' that can be dialed up or down to control the level of approximation of different components in the program. We formulate this control problem as a constrained optimization problem, and describe a system called Capri that uses machine learning to learn cost and error models for the program, and uses these models to determine, for a desired level of approximation, knob settings that optimize metrics such as running time or energy usage. Experimental results with complex benchmarks from different problem domains demonstrate the effectiveness of this approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Park:2016:ATC, author = "Jongse Park and Emmanuel Amaro and Divya Mahajan and Bradley Thwaites and Hadi Esmaeilzadeh", title = "{AxGames}: Towards Crowdsourcing Quality Target Determination in Approximate Computing", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "623--636", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872376", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Approximate computing trades quality of application output for higher efficiency and performance. Approximation is useful only if its impact on application output quality is acceptable to the users. However, there is a lack of systematic solutions and studies that explore users' perspective on the effects of approximation. In this paper, we seek to provide one such solution for the developers to probe and discover the boundary of quality loss that most users will deem acceptable. 
We propose AxGames, a crowdsourced solution that enables developers to readily infer a statistical common ground from the general public through three entertaining games. The users engage in these games by betting on their opinion about the quality loss of the final output while the AxGames framework collects statistics about their perceptions. The framework then statistically analyzes the results to determine the acceptable levels of quality for a pair of (application, approximation technique). The three games are designed such that they effectively capture quality requirements with various tradeoffs and contexts. To evaluate AxGames, we examine seven diverse applications that produce user perceptible outputs and cover a wide range of domains, including image processing, optical character recognition, speech to text conversion, and audio processing. We recruit 700 participants/users through Amazon's Mechanical Turk to play the games that collect statistics about their perception on different levels of quality. Subsequently, the AxGames framework uses the Clopper-Pearson exact method, which computes a binomial proportion confidence interval, to analyze the collected statistics for each level of quality. Using this analysis, AxGames can statistically project the quality level that satisfies a given percentage of users. The developers can use these statistical projections to tune the level of approximation based on the user experience. We find that the level of acceptable quality loss significantly varies across applications. For instance, to satisfy 90\% of users, the level of acceptable quality loss is 2\% for one application (image processing) and 26\% for another (audio processing). Moreover, the pattern with which the crowd responds to approximation takes significantly different shape and form depending on the class of applications. 
These results confirm the necessity of solutions that systematically explore the effect of approximation on the end user experience.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Bornholt:2016:DBA, author = "James Bornholt and Randolph Lopez and Douglas M. Carmean and Luis Ceze and Georg Seelig and Karin Strauss", title = "A {DNA}-Based Archival Storage System", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "637--649", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872397", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Demand for data storage is growing exponentially, but the capacity of existing storage media is not keeping up. Using DNA to archive data is an attractive possibility because it is extremely dense, with a raw limit of 1 exabyte/mm$^3$ (10$^9$ GB/mm$^3$), and long-lasting, with observed half-life of over 500 years. This paper presents an architecture for a DNA-based archival storage system. It is structured as a key-value store, and leverages common biochemical techniques to provide random access. We also propose a new encoding scheme that offers controllable redundancy, trading off reliability for density. We demonstrate feasibility, random access, and robustness of the proposed encoding with wet lab experiments involving 151 kB of synthesized DNA and a 42 kB random-access subset, and simulation experiments of larger sets calibrated to the wet lab experiments.
Finally, we highlight trends in biotechnology that indicate the impending practicality of DNA storage for much larger datasets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Prabhakar:2016:GCH, author = "Raghu Prabhakar and David Koeplinger and Kevin J. Brown and HyoukJoong Lee and Christopher {De Sa} and Christos Kozyrakis and Kunle Olukotun", title = "Generating Configurable Hardware from Parallel Patterns", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "651--665", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872415", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In recent years the computing landscape has seen an increasing shift towards specialized accelerators. Field programmable gate arrays (FPGAs) are particularly promising for the implementation of these accelerators, as they offer significant performance and energy improvements over CPUs for a wide class of applications and are far more flexible than fixed-function ASICs. However, FPGAs are difficult to program. Traditional programming models for reconfigurable logic use low-level hardware description languages like Verilog and VHDL, which have none of the productivity features of modern software languages but produce very efficient designs, and low-level software languages like C and OpenCL coupled with high-level synthesis (HLS) tools that typically produce designs that are far less efficient. 
Functional languages with parallel patterns are a better fit for hardware generation because they provide high-level abstractions to programmers with little experience in hardware design and avoid many of the problems faced when generating hardware from imperative languages. In this paper, we identify two important optimizations for using parallel patterns to generate efficient hardware: tiling and metapipelining. We present a general representation of tiled parallel patterns, and provide rules for automatically tiling patterns and generating metapipelines. We demonstrate experimentally that these optimizations result in speedups up to 39.4$ \times $ on a set of benchmarks from the data analytics domain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Chang:2016:DLD, author = "Li-Wen Chang and Hee-Seok Kim and Wen-mei W. Hwu", title = "{DySel}: Lightweight Dynamic Selection for Kernel-based Data-parallel Programming Model", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "667--680", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872373", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The rising pressure for simultaneously improving performance and reducing power is driving more diversity into all aspects of computing devices. An algorithm that is well-matched to the target hardware can run multiple times faster and more energy efficiently than one that is not. The problem is complicated by the fact that a program's input also affects the appropriate choice of algorithm. 
As a result, software developers have been faced with the challenge of determining the appropriate algorithm for each potential combination of target device and data. This paper presents DySel, a novel runtime system for automating such determination for kernel-based data parallel programming models such as OpenCL, CUDA, OpenACC, and C++AMP. These programming models cover many applications that demand high performance in mobile, cloud and high-performance computing. DySel systematically deploys candidate kernels on a small portion of the actual data to determine which achieves the best performance for the hardware-data combination. The test-deployment, referred to as micro-profiling, contributes to the final execution result and incurs less than 8\% of overhead in the worst observed case when compared to an oracle. We show four major use cases where DySel provides significantly more consistent performance without tedious effort from the developer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Chen:2016:BQA, author = "Quan Chen and Hailong Yang and Jason Mars and Lingjia Tang", title = "{Baymax}: {QoS} Awareness and Increased Utilization for Non-Preemptive Accelerators in Warehouse Scale Computers", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "681--696", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872368", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern warehouse-scale computers (WSCs) are being outfitted with accelerators to provide the significant compute required by emerging intelligent personal assistant (IPA) workloads such as voice recognition, image classification, and natural language processing. 
It is well known that the diurnal user access pattern of user-facing services provides a strong incentive to co-locate applications for better accelerator utilization and efficiency, and prior work has focused on enabling co-location on multicore processors. However, interference when co-locating applications on non-preemptive accelerators is fundamentally different than contention on multi-core CPUs and introduces a new set of challenges to reduce QoS violation. To address this open problem, we first identify the underlying causes for QoS violation in accelerator-outfitted servers. Our experiments show that queuing delay for the compute resources and PCI-e bandwidth contention for data transfer are the main two factors that contribute to the long tails of user-facing applications. We then present Baymax, a runtime system that orchestrates the execution of compute tasks from different applications and mitigates PCI-e bandwidth contention to deliver the required QoS for user-facing applications and increase the accelerator utilization. Using DjiNN, a deep neural network service, Sirius, an end-to-end IPA workload, and traditional applications on a Nvidia K40 GPU, our evaluation shows that Baymax improves the accelerator utilization by 91.3\% while achieving the desired 99\%-ile latency target for user-facing applications.
In fact, Baymax reduces the 99\%-ile latency of user-facing applications by up to 195x over default execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Nowatzki:2016:ABS, author = "Tony Nowatzki and Karthikeyan Sankaralingam", title = "Analyzing Behavior Specialized Acceleration", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "697--711", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872412", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware specialization has become a promising paradigm for overcoming the inefficiencies of general purpose microprocessors. Of significant interest are Behavioral Specialized Accelerators (BSAs), which are designed to efficiently execute code with only certain properties, but remain largely configurable or programmable. The most important strength of BSAs --- their ability to target a wide variety of codes --- also makes their interactions and analysis complex, raising the following questions: can multiple BSAs be composed synergistically, what are their interactions with the general purpose core, and what combinations favor which workloads? From a methodological standpoint, BSAs are also challenging, as they each require ISA development, compiler and assembler extensions, and either simulator or RTL models. To study the potential of BSAs, we propose a novel modeling technique called the Transformable Dependence Graph (TDG) --- a higher level alternative to the time-consuming traditional compiler+simulator approach, while still enabling detailed microarchitectural models for both general cores and accelerators. 
We then propose a multi-BSA organization, called ExoCore, which we model and study using the TDG. A design space exploration reveals that an ExoCore organization can push designs beyond the established energy-performance frontiers for general purpose cores. For example, a 2-wide OOO processor with three BSAs matches the performance of a conventional 6-wide OOO core, has 40\% lower area, and is 2.6x more energy efficient.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Yoon:2016:PPI, author = "Man-Ki Yoon and Negin Salajegheh and Yin Chen and Mihai Christodorescu", title = "{PIFT}: Predictive Information-Flow Tracking", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "713--725", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872403", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Phones today carry sensitive information and have a great number of ways to communicate that data. As a result, malware that steal money, information, or simply disable functionality have hit the app stores. Current security solutions for preventing undesirable data leaks are mostly high-overhead and have not been practical enough for smartphones. In this paper, we show that by simply monitoring just some instructions (only memory loads and stores) it is possible to achieve low overhead, highly accurate information flow tracking.
Our method achieves 98\% accuracy (0\% false positive and 2\% false negative) over DroidBench and was able to successfully catch seven real-world malware instances that steal phone number, location, and device ID using SMS messages and HTTP connections.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Venkat:2016:HHI, author = "Ashish Venkat and Sriskanda Shamasunder and Hovav Shacham and Dean M. Tullsen", title = "{HIPStR}: Heterogeneous-{ISA} Program State Relocation", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "727--741", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872408", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Heterogeneous Chip Multiprocessors have been shown to provide significant performance and energy efficiency gains over homogeneous designs. Recent research has expanded the dimensions of heterogeneity to include diverse Instruction Set Architectures, called Heterogeneous-ISA Chip Multiprocessors. This work leverages such an architecture to realize substantial new security benefits, and in particular, to thwart Return-Oriented Programming. This paper proposes a novel security defense called HIPStR --- Heterogeneous-ISA Program State Relocation --- that performs dynamic randomization of run-time program state, both within and across ISAs. 
This technique outperforms the state-of-the-art just-in-time code reuse (JIT-ROP) defense by an average of 15.6\%, while simultaneously providing greater security guarantees against classic return-into-libc, ROP, JOP, brute force, JIT-ROP, and several evasive variants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Aweke:2016:ASB, author = "Zelalem Birhanu Aweke and Salessawi Ferede Yitbarek and Rui Qiao and Reetuparna Das and Matthew Hicks and Yossi Oren and Todd Austin", title = "{ANVIL}: Software-Based Protection Against Next-Generation Rowhammer Attacks", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "743--755", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872390", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Ensuring the integrity and security of the memory system is critical. Recent studies have shown serious security concerns due to ``rowhammer'' attacks, where repeated accesses to a row of memory cause bit flips in adjacent rows. Recent work by Google's Project Zero has shown how to leverage rowhammer-induced bit-flips as the basis for security exploits that include malicious code injection and memory privilege escalation. Being an important security concern, industry has attempted to defend against rowhammer attacks. Deployed defenses employ two strategies: (1) doubling the system DRAM refresh rate and (2) restricting access to the CLFLUSH instruction that attackers use to bypass the cache to increase memory access frequency (i.e., the rate of rowhammering). 
We demonstrate that such defenses are inadequate: we implement rowhammer attacks that both avoid using the CLFLUSH instruction and cause bit flips with a doubled refresh rate. Our next-generation CLFLUSH-free rowhammer attack bypasses the cache by manipulating cache replacement state to allow frequent misses out of the last-level cache to DRAM rows of our choosing. To protect existing systems from more advanced rowhammer attacks, we develop a software-based defense, ANVIL, which thwarts all known rowhammer attacks on existing systems. ANVIL detects rowhammer attacks by tracking the locality of DRAM accesses using existing hardware performance counters. Our detector identifies the rows being frequently accessed (i.e., the aggressors), then selectively refreshes the nearby victim rows to prevent hammering. Experiments running on real hardware with the SPEC2006 benchmarks show that ANVIL has less than a 1\% false positive rate and an average slowdown of 1\%. ANVIL is low-cost and robust, and our experiments indicate that it is an effective approach for protecting existing and future systems from even advanced rowhammer attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Didona:2016:PAM, author = "Diego Didona and Nuno Diegues and Anne-Marie Kermarrec and Rachid Guerraoui and Ricardo Neves and Paolo Romano", title = "{ProteusTM}: Abstraction Meets Performance in Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "757--771", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872385", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The Transactional Memory (TM) paradigm promises to greatly simplify the development 
of concurrent applications. This led, over the years, to the creation of a plethora of TM implementations delivering wide ranges of performance across workloads. Yet, no universal implementation fits each and every workload. In fact, the best TM in a given workload can reveal to be disastrous for another one. This forces developers to face the complex task of tuning TM implementations, which significantly hampers their wide adoption. In this paper, we address the challenge of automatically identifying the best TM implementation for a given workload. Our proposed system, ProteusTM, hides behind the TM interface a large library of implementations. Underneath, it leverages a novel multi-dimensional online optimization scheme, combining two popular learning techniques: Collaborative Filtering and Bayesian Optimization. We integrated ProteusTM in GCC and demonstrate its ability to switch between TMs and adapt several configuration parameters (e.g., number of threads). We extensively evaluated ProteusTM, obtaining average performance {$<$3}\% from optimal, and gains up to 100x over static alternatives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Shalev:2016:CCS, author = "Noam Shalev and Eran Harpaz and Hagar Porat and Idit Keidar and Yaron Weinsberg", title = "{CSR}: Core Surprise Removal in Commodity Operating Systems", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "773--787", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872369", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "One of the adverse effects of shrinking transistor sizes is that processors have become increasingly prone to hardware faults. 
At the same time, the number of cores per die rises. Consequently, core failures can no longer be ruled out, and future operating systems for many-core machines will have to incorporate fault tolerance mechanisms. We present CSR, a strategy for recovery from unexpected permanent processor faults in commodity operating systems. Our approach overcomes surprise removal of faulty cores, and also tolerates cascading core failures. When a core fails in user mode, CSR terminates the process executing on that core and migrates the remaining processes in its run-queue to other cores. We further show how hardware transactional memory may be used to overcome failures in critical kernel code. Our solution is scalable, incurs low overhead, and is designed to integrate into modern operating systems. We have implemented it in the Linux kernel, using Haswell's Transactional Synchronization Extension, and tested it on a real system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Gangwani:2016:CBS, author = "Tanmay Gangwani and Adam Morrison and Josep Torrellas", title = "{CASPAR}: Breaking Serialization in Lock-Free Multicore Synchronization", journal = j-COMP-ARCH-NEWS, volume = "44", number = "2", pages = "789--804", month = may, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2980024.2872400", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:42 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In multicores, performance-critical synchronization is increasingly performed in a lock-free manner using atomic instructions such as CAS or LL/SC. However, when many processors synchronize on the same variable, performance can still degrade significantly. Contending writes get serialized, creating a non-scalable condition. 
Past proposals that build hardware queues of synchronizing processors do not fundamentally solve this problem---at best, they help to efficiently serialize the contending writes. This paper proposes a novel architecture that breaks the serialization of hardware queues and enables the queued processors to perform lock-free synchronization in parallel. The architecture, called CASPAR, is able to (1) execute the CASes in the queued-up processors in parallel through eager forwarding of expected values, and (2) validate the CASes in parallel and dequeue groups of processors at a time. The result is highly-scalable synchronization. We evaluate CASPAR with simulations of a 64-core chip. Compared to existing proposals with hardware queues, CASPAR improves the throughput of kernels by 32\% on average, and reduces the execution time of the sections considered in lock-free versions of applications by 47\% on average. This makes these sections 2.5x faster than in the original applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'16 conference proceedings.", } @Article{Albericio:2016:CIN, author = "Jorge Albericio and Patrick Judd and Tayler Hetherington and Tor Aamodt and Natalie Enright Jerger and Andreas Moshovos", title = "{Cnvlutin}: ineffectual-neuron-free deep neural network computing", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "1--13", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001138", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This work observes that a large fraction of the computations performed by Deep Neural Networks (DNNs) are intrinsically ineffectual as they involve a multiplication where one of the inputs is zero. 
This observation motivates Cnvlutin (CNV), a value-based approach to hardware acceleration that eliminates most of these ineffectual operations, improving performance and energy over a state-of-the-art accelerator with no accuracy loss. CNV uses hierarchical data-parallel units, allowing groups of lanes to proceed mostly independently enabling them to skip over the ineffectual computations. A co-designed data storage format encodes the computation elimination decisions taking them off the critical path while avoiding control divergence in the data parallel units. Combined, the units and the data storage format result in a data-parallel architecture that maintains wide, aligned accesses to its memory hierarchy and that keeps its data lanes busy. By loosening the ineffectual computation identification criterion, CNV enables further performance and energy efficiency improvements, and more so if a loss in accuracy is acceptable. Experimental measurements over a set of state-of-the-art DNNs for image classification show that CNV improves performance over a state-of-the-art accelerator from 1.24$ \times $ to 1.55$ \times $ and by 1.37$ \times $ on average without any loss in accuracy by removing zero-valued operand multiplications alone. While CNV incurs an area overhead of 4.49\%, it improves overall EDP (Energy Delay Product) and ED$^2$P (Energy Delay Squared Product) on average by 1.47$ \times $ and 2.01$ \times $, respectively. The average performance improvements increase to 1.52$ \times $ without any loss in accuracy with a broader ineffectual identification policy. Further improvements are demonstrated with a loss in accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Shafiee:2016:ICN, author = "Ali Shafiee and Anirban Nag and Naveen Muralimanohar and Rajeev Balasubramonian and John Paul Strachan and Miao Hu and R.
Stanley Williams and Vivek Srikumar", title = "{ISAAC}: a convolutional neural network accelerator with in-situ analog arithmetic in crossbars", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "14--26", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001139", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A number of recent efforts have attempted to design accelerators for popular machine learning algorithms, such as those involving convolutional and deep neural networks (CNNs and DNNs). These algorithms typically involve a large number of multiply-accumulate (dot-product) operations. A recent project, DaDianNao, adopts a near data processing approach, where a specialized neural functional unit performs all the digital arithmetic operations and receives input weights from adjacent eDRAM banks. This work explores an in-situ processing approach, where memristor crossbar arrays not only store input weights, but are also used to perform dot-product operations in an analog manner. While the use of crossbar memory as an analog dot-product engine is well known, no prior work has designed or characterized a full-fledged accelerator based on crossbars. In particular, our work makes the following contributions: (i) We design a pipelined architecture, with some crossbars dedicated for each neural network layer, and eDRAM buffers that aggregate data between pipeline stages. (ii) We define new data encoding techniques that are amenable to analog computations and that can reduce the high overheads of analog-to-digital conversion (ADC). (iii) We define the many supporting digital components required in an analog CNN accelerator and carry out a design space exploration to identify the best balance of memristor storage/compute, ADCs, and eDRAM storage on a chip. 
On a suite of CNN and DNN workloads, the proposed ISAAC architecture yields improvements of 14.8$ \times $, 5.5$ \times $, and 7.5$ \times $ in throughput, energy, and computational density (respectively), relative to the state-of-the-art DaDianNao architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Chi:2016:PNP, author = "Ping Chi and Shuangchen Li and Cong Xu and Tao Zhang and Jishen Zhao and Yongpan Liu and Yu Wang and Yuan Xie", title = "{PRIME}: a novel processing-in-memory architecture for neural network computation in {ReRAM}-based main memory", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "27--39", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001140", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Processing-in-memory (PIM) is a promising solution to address the ``memory wall'' challenges for future computer systems. Prior proposed PIM architectures put additional computation logic in or near memory. The emerging metal-oxide resistive random access memory (ReRAM) has showed its potential to be used for main memory. Moreover, with its crossbar array structure, ReRAM can perform matrix-vector multiplication efficiently, and has been widely studied to accelerate neural network (NN) applications. In this work, we propose a novel PIM architecture, called PRIME, to accelerate NN applications in ReRAM based main memory. In PRIME, a portion of ReRAM crossbar arrays can be configured as accelerators for NN applications or as normal memory for a larger memory space. We provide microarchitecture and circuit designs to enable the morphable functions with an insignificant area overhead. 
We also design a software/hardware interface for software developers to implement various NNs on PRIME. Benefiting from both the PIM architecture and the efficiency of using ReRAM for NN computation, PRIME distinguishes itself from prior work on NN acceleration, with significant performance improvement and energy saving. Our experimental results show that, compared with a state-of-the-art neural processing unit design, PRIME improves the performance by $ \sim $2360$ \times $ and the energy consumption by $ \sim $895$ \times $, across the evaluated machine learning benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Torng:2016:AAW, author = "Christopher Torng and Moyang Wang and Christopher Batten", title = "Asymmetry-aware work-stealing runtimes", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "40--52", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001142", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Amdahl's law provides architects a compelling reason to introduce system asymmetry to optimize for both serial and parallel regions of execution. Asymmetry in a multicore processor can arise statically (e.g., from core microarchitecture) or dynamically (e.g., applying dynamic voltage/frequency scaling). Work stealing is an increasingly popular approach to task distribution that elegantly balances task-based parallelism across multiple worker threads. In this paper, we propose asymmetry-aware work-stealing (AAWS) runtimes, which are carefully designed to exploit both the static and dynamic asymmetry in modern systems. AAWS runtimes use three key hardware/software techniques: work-pacing, work-sprinting, and work-mugging.
Work-pacing and work-sprinting are novel techniques that combine a marginal-utility-based approach with integrated voltage regulators to improve performance and energy efficiency in high- and low-parallel regions. Work-mugging is a previously proposed technique that enables a waiting big core to preemptively migrate work from a busy little core. We propose a simple implementation of work-mugging based on lightweight user-level interrupts. We use a vertically integrated research methodology spanning software, architecture, and VLSI to make the case that holistically combining static asymmetry, dynamic asymmetry, and work-stealing runtimes can improve both performance and energy efficiency in future multicore systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Tseng:2016:MCA, author = "Hung-Wei Tseng and Qianchen Zhao and Yuxiao Zhou and Mark Gahagan and Steven Swanson", title = "{Morpheus}: creating application objects efficiently for heterogeneous computing", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "53--65", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001143", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In high performance computing systems, object deserialization can become a surprisingly important bottleneck---in our test, a set of general-purpose, highly parallelized applications spends 64\% of total execution time deserializing data into objects. This paper presents the Morpheus model, which allows applications to move such computations to a storage device. We use this model to deserialize data into application objects inside storage devices, rather than in the host CPU. 
Using the Morpheus model for object deserialization avoids unnecessary system overheads, frees up scarce CPU and main memory resources for compute-intensive workloads, saves I/O bandwidth, and reduces power consumption. In heterogeneous, co-processor-equipped systems, Morpheus allows application objects to be sent directly from a storage device to a co-processor (e.g., a GPU) by peer-to-peer transfer, further improving application performance as well as reducing the CPU and main memory utilizations. This paper implements Morpheus-SSD, an SSD supporting the Morpheus model. Morpheus-SSD improves the performance of object deserialization by 1.66$ \times $, reduces power consumption by 7\%, uses 42\% less energy, and speeds up the total execution time by 1.32$ \times $. By using NVMe-P2P that realizes peer-to-peer communication between Morpheus-SSD and a GPU, Morpheus-SSD can speed up the total execution time by 1.39$ \times $ in a heterogeneous computing platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Mahajan:2016:TSG, author = "Divya Mahajan and Amir Yazdanbakhsh and Jongse Park and Bradley Thwaites and Hadi Esmaeilzadeh", title = "Towards statistical guarantees in controlling quality tradeoffs for approximate acceleration", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "66--77", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001144", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Conventionally, an approximate accelerator replaces every invocation of a frequently executed region of code without considering the final quality degradation. 
However, there is a vast decision space in which each invocation can either be delegated to the accelerator---improving performance and efficiency---or run on the precise core---maintaining quality. In this paper we introduce Mithra, a co-designed hardware-software solution, that navigates these tradeoffs to deliver high performance and efficiency while lowering the final quality loss. Mithra seeks to identify whether each individual accelerator invocation will lead to an undesirable quality loss and, if so, directs the processor to run the original precise code. This identification is cast as a binary classification task that requires a cohesive co-design of hardware and software. The hardware component performs the classification at runtime and exposes a knob to the software mechanism to control quality tradeoffs. The software tunes this knob by solving a statistical optimization problem that maximizes benefits from approximation while providing statistical guarantees that final quality level will be met with high confidence. The software uses this knob to tune and train the hardware classifiers. We devise two distinct hardware classifiers, one table-based and one neural network based. To understand the efficacy of these mechanisms, we compare them with an ideal, but infeasible design, the oracle. Results show that, with 95\% confidence the table-based design can restrict the final output quality loss to 5\% for 90\% of unseen input sets while providing 2.5$ \times $ speedup and 2.6$ \times $ energy efficiency. The neural design shows similar speedup however, improves the efficiency by 13\%. Compared to the table-based design, the oracle improves speedup by 26\% and efficiency by 36\%.
These results show that Mithra performs within a close range of the oracle and can effectively navigate the quality tradeoffs in approximate acceleration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Jain:2016:BFL, author = "Akanksha Jain and Calvin Lin", title = "Back to the future: leveraging {Belady}'s algorithm for improved cache replacement", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "78--89", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001146", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Belady's algorithm is optimal but infeasible because it requires knowledge of the future. This paper explains how a cache replacement algorithm can nonetheless learn from Belady's algorithm by applying it to past cache accesses to inform future cache replacement decisions. We show that the implementation is surprisingly efficient, as we introduce a new method of efficiently simulating Belady's behavior, and we use known sampling techniques to compactly represent the long history information that is needed for high accuracy. For a 2MB LLC, our solution uses a 16KB hardware budget (excluding replacement state in the tag array). When applied to a memory-intensive subset of the SPEC 2006 CPU benchmarks, our solution improves performance over LRU by 8.4\%, as opposed to 6.2\% for the previous state-of-the-art. 
For a 4-core system with a shared 8MB LLC, our solution improves performance by 15.0\%, compared to 12.0\% for the previous state-of-the-art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Park:2016:ESFa, author = "Chang Hyun Park and Taekyung Heo and Jaehyuk Huh", title = "Efficient synonym filtering and scalable delayed translation for hybrid virtual caching", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "90--102", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001147", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Conventional translation look-aside buffers (TLBs) are required to complete address translation with short latencies, as the address translation is on the critical path of all memory accesses even for L1 cache hits. Such strict TLB latency restrictions limit the TLB capacity, as the latency increase with large TLBs may lower the overall performance even with potential TLB miss reductions. Furthermore, TLBs consume a significant amount of energy as they are accessed for every instruction fetch and data access. To avoid the latency restriction and reduce the energy consumption, virtual caching techniques have been proposed to defer translation to after L1 cache misses. However, an efficient solution for the synonym problem has been a critical issue hindering the wide adoption of virtual caching. Based on the virtual caching concept, this study proposes a hybrid virtual memory architecture extending virtual caching to the entire cache hierarchy, aiming to improve both performance and energy consumption.
The hybrid virtual caching uses virtual addresses augmented with address space identifiers (ASID) in the cache hierarchy for common non-synonym addresses. For such non-synonyms, the address translation occurs only after last-level cache (LLC) misses. For uncommon synonym addresses, the addresses are translated to physical addresses with conventional TLBs before L1 cache accesses. To support such hybrid translation, we propose an efficient synonym detection mechanism based on Bloom filters which can identify synonym candidates with few false positives. For large memory applications, delayed translation alone cannot solve the address translation problem, as fixed-granularity delayed TLBs may not scale with the increasing memory requirements. To mitigate the translation scalability problem, this study proposes a delayed many segment translation designed for the hybrid virtual caching. The experimental results show that our approach effectively lowers accesses to the TLBs, leading to significant power savings. 
In addition, the approach provides performance improvement with scalable delayed translation with variable length segments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Cheng:2016:LLB, author = "Hsiang-Yun Cheng and Jishen Zhao and Jack Sampson and Mary Jane Irwin and Aamer Jaleel and Yu Lu and Yuan Xie", title = "{LAP}: loop-block aware inclusion properties for energy-efficient asymmetric last level caches", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "103--114", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001148", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging non-volatile memory (NVM) technologies, such as spin-transfer torque RAM (STT-RAM), are attractive options for replacing or augmenting SRAM in implementing last-level caches (LLCs). However, the asymmetric read/write energy and latency associated with NVM introduces new challenges in designing caches where, in contrast to SRAM, dynamic energy from write operations can be responsible for a larger fraction of total cache energy than leakage. These properties lead to the fact that no single traditional inclusion policy being dominant in terms of LLC energy consumption for asymmetric LLCs. We propose a novel selective inclusion policy, Loop-block-Aware Policy (LAP), to reduce energy consumption in LLCs with asymmetric read/write properties. In order to eliminate redundant writes to the LLC, LAP incorporates advantages from both non-inclusive and exclusive designs to selectively cache only part of upper-level data in the LLC.
Results show that LAP outperforms other variants of selective inclusion policies and consumes 20\% and 12\% less energy than non-inclusive and exclusive STT-RAM-based LLCs, respectively. We extend LAP to a system with SRAM/STT-RAM hybrid LLCs to achieve energy-efficient data placement, reducing the energy consumption by 22\% and 15\% over non-inclusion and exclusion on average, with average-case performance improvements, small worst-case performance loss, and minimal hardware overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Koeplinger:2016:AGE, author = "David Koeplinger and Christina Delimitrou and Raghu Prabhakar and Christos Kozyrakis and Yaqi Zhang and Kunle Olukotun", title = "Automatic generation of efficient accelerators for reconfigurable hardware", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "115--127", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001150", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Acceleration in the form of customized datapaths offer large performance and energy improvements over general purpose processors. Reconfigurable fabrics such as FPGAs are gaining popularity for use in implementing application-specific accelerators, thereby increasing the importance of having good high-level FPGA design tools. However, current tools for targeting FPGAs offer inadequate support for high-level programming, resource estimation, and rapid and automatic design space exploration. We describe a design framework that addresses these challenges. We introduce a new representation of hardware using parameterized templates that captures locality and parallelism information at multiple levels of nesting. 
This representation is designed to be automatically generated from high-level languages based on parallel patterns. We describe a hybrid area estimation technique which uses template-level models and design-level artificial neural networks to account for effects from hardware place-and-route tools, including routing overheads, register and block RAM duplication, and LUT packing. Our runtime estimation accounts for off-chip memory accesses. We use our estimation capabilities to rapidly explore a large space of designs across tile sizes, parallelization factors, and optional coarse-grained pipelining, all at multiple loop levels. We show that estimates average 4.8\% error for logic resources, 6.1\% error for runtimes, and are 279 to 6533 times faster than a commercial high-level synthesis tool. We compare the best-performing designs to optimized CPU code running on a server-grade 6 core processor and show speedups of up to 16.7$ \times $.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Kim:2016:SFA, author = "Donggyu Kim and Adam Izraelevitz and Christopher Celio and Hokeun Kim and Brian Zimmer and Yunsup Lee and Jonathan Bachrach and Krste Asanovi{\'c}", title = "{Strober}: fast and accurate sample-based energy simulation for arbitrary {RTL}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "128--139", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001151", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a sample-based energy simulation methodology that enables fast and accurate estimations of performance and average power for arbitrary RTL designs. 
Our approach uses an FPGA to simultaneously simulate the performance of an RTL design and to collect samples containing exact RTL state snapshots. Each snapshot is then replayed in gate-level simulation, resulting in a workload-specific average power estimate with confidence intervals. For arbitrary RTL and workloads, our methodology guarantees a minimum of four-orders-of-magnitude speedup over commercial CAD gate-level simulation tools and gives average energy estimates guaranteed to be within 5\% of the true average energy with 99\% confidence. We believe our open-source sample-based energy simulation tool Strober can not only rapidly provide ground truth for more abstract power models, but can enable productive design-space exploration early in the RTL design process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Laurenzano:2016:PIM, author = "Michael A. Laurenzano and Yunqi Zhang and Jiang Chen and Lingjia Tang and Jason Mars", title = "{PowerChop}: identifying and managing non-critical units in hybrid processor architectures", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "140--152", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001152", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "On-core microarchitectural structures consume significant portions of a processor's power budget. However, depending on application characteristics, those structures do not always provide (much) performance benefit. 
While timeout-based power gating techniques have been leveraged for underutilized cores and inactive functional units, these techniques have not directly translated to high-activity units such as vector processing units, complex branch predictors, and caches. The performance benefit provided by these units does not necessarily correspond with unit activity, but instead is a function of application characteristics. This work introduces PowerChop, a novel technique that leverages the unique capabilities of HW/SW co-designed hybrid processors to enact unit-level power management at the application phase level. PowerChop adds two small additional hardware units to facilitate phase identification and triggering different power states, enabling the software layer to cheaply track, predict and take advantage of varying unit criticality across application phases by powering gating units that are not needed for performant execution. Through detailed experimentation, we find that PowerChop significantly decreases power consumption, reducing the leakage power of a hybrid server processor by 9\% on average (up to 33\%) and a hybrid mobile processor by 19\% (up to 40\%) while introducing just 2\% slowdown.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Gu:2016:BFN, author = "Boncheol Gu and Andre S. 
Yoon and Duck-Ho Bae and Insoon Jo and Jinyoung Lee and Jonghyun Yoon and Jeong-Uk Kang and Moonsang Kwon and Chanho Yoon and Sangyeun Cho and Jaeheon Jeong and Duckhyun Chang", title = "{Biscuit}: a framework for near-data processing of big data workloads", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "153--165", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001154", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Data-intensive queries are common in business intelligence, data warehousing and analytics applications. Typically, processing a query involves full inspection of large in-storage data sets by CPUs. An intuitive way to speed up such queries is to reduce the volume of data transferred over the storage network to a host system. This can be achieved by filtering out extraneous data within the storage, motivating a form of near-data processing. This work presents Biscuit, a novel near-data processing framework designed for modern solid-state drives. It allows programmers to write a data-intensive application to run on the host system and the storage system in a distributed, yet seamless manner. In order to offer a high-level programming model, Biscuit builds on the concept of data flow. Data processing tasks communicate through typed and data-ordered ports. Biscuit does not distinguish tasks that run on the host system and the storage system. As the result, Biscuit has desirable traits like generality and expressiveness, while promoting code reuse and naturally exposing concurrency. We implement Biscuit on a host system that runs the Linux OS and a high-performance solid-state drive. We demonstrate the effectiveness of our approach and implementation with experimental results. 
When data filtering is done by hardware in the solid-state drive, the average speed-up obtained for the top five queries of TPC-H is over 15$ \times $.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Ozdal:2016:EEA, author = "Muhammet Mustafa Ozdal and Serif Yesil and Taemin Kim and Andrey Ayupov and John Greth and Steven Burns and Ozcan Ozturk", title = "Energy efficient architecture for graph analytics accelerators", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "166--177", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001155", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Specialized hardware accelerators can significantly improve the performance and power efficiency of compute systems. In this paper, we focus on hardware accelerators for graph analytics applications and propose a configurable architecture template that is specifically optimized for iterative vertex-centric graph applications with irregular access patterns and asymmetric convergence. The proposed architecture addresses the limitations of the existing multi-core CPU and GPU architectures for these types of applications. The SystemC-based template we provide can be customized easily for different vertex-centric applications by inserting application-level data structures and functions. After that, a cycle-accurate simulator and RTL can be generated to model the target hardware accelerators. In our experiments, we study several graph-parallel applications, and show that the hardware accelerators generated by our template can outperform a 24 core high end server CPU system by up to 3x in terms of performance. 
We also estimate the area requirement and power consumption of these hardware accelerators through physical-aware logic synthesis, and show up to 65x better power consumption with significantly smaller area.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Magaki:2016:ACS, author = "Ikuo Magaki and Moein Khazraee and Luis Vega Gutierrez and Michael Bedford Taylor", title = "{ASIC} clouds: specializing the datacenter", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "178--190", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001156", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "GPU and FPGA-based clouds have already demonstrated the promise of accelerating computing-intensive workloads with greatly improved power and performance. In this paper, we examine the design of ASIC Clouds, which are purpose-built datacenters comprised of large arrays of ASIC accelerators, whose purpose is to optimize the total cost of ownership (TCO) of large, high-volume chronic computations, which are becoming increasingly common as more and more services are built around the Cloud model. On the surface, the creation of ASIC clouds may seem highly improbable due to high NREs and the inflexibility of ASICs. Surprisingly, however, large-scale ASIC Clouds have already been deployed by a large number of commercial entities, to implement the distributed Bitcoin cryptocurrency system. We begin with a case study of Bitcoin mining ASIC Clouds, which are perhaps the largest ASIC Clouds to date. 
From there, we design three more ASIC Clouds, including a YouTube-style video transcoding ASIC Cloud, a Litecoin ASIC Cloud, and a Convolutional Neural Network ASIC Cloud and show 2-3 orders of magnitude better TCO versus CPU and GPU. Among our contributions, we present a methodology that given an accelerator design, derives Pareto-optimal ASIC Cloud Servers, by extracting data from place-and-routed circuits and computational fluid dynamic simulations, and then employing clever but brute-force search to find the best jointly-optimized ASIC, DRAM subsystem, motherboard, power delivery system, cooling system, operating voltage, and case design. Moreover, we show how data center parameters determine which of the many Pareto-optimal points is TCO-optimal. Finally we examine when it makes sense to build an ASIC Cloud, and examine the impact of ASIC NRE.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Oh:2016:AIC, author = "Yunho Oh and Keunsoo Kim and Myung Kuk Yoon and Jong Hyun Park and Yongjun Park and Won Woo Ro and Murali Annavaram", title = "{APRES}: improving cache efficiency by exploiting load characteristics on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "191--203", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001158", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Long memory latency and limited throughput become performance bottlenecks of GPGPU applications. The latency takes hundreds of cycles which is difficult to be hidden by simply interleaving tens of warp execution. 
While cache hierarchy helps to reduce memory system pressure, massive Thread-Level Parallelism (TLP) often causes excessive cache contention. This paper proposes Adaptive PREfetching and Scheduling (APRES) to improve GPU cache efficiency. APRES relies on the following observations. First, certain static load instructions tend to generate memory addresses having very high locality. Second, although loads have no locality, the access addresses still can show highly strided access pattern. Third, the locality behavior tends to be consistent regardless of warp ID. APRES schedules warps so that as many cache hits generated as possible before any cache misses generated. This is to minimize cache thrashing when many warps are contending for a cache line. However, to realize this operation, it is required to predict which warp will hit the cache in the near future. Without directly predicting future cache hit/miss for each warp, APRES creates a group of warps that will execute the same load instruction in the near future. Based on the third observation, we expect the locality behavior is consistent over all warps in the group. If the first executed warp in the group hits the cache, then the load is considered as a high locality type, and APRES prioritizes all warps in the group. Group prioritization leads to consecutive cache hits, because the grouped warps are likely to access the same cache line. If the first warp missed the cache, then the load is considered as a strided type, and APRES generates prefetch requests for the other warps in the group. After that, APRES prioritizes prefetch targeted warps so that the demand requests are merged to Miss Status Holding Register (MSHR) or prefetched lines can be accessed. 
On memory-intensive applications, APRES achieves 31.7\% performance improvement compared to the baseline GPU and 7.2\% additional speedup compared to the best combination of existing warp scheduling and prefetching methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Hsieh:2016:TOM, author = "Kevin Hsieh and Eiman Ebrahimi and Gwangsun Kim and Niladrish Chatterjee and Mike O'Connor and Nandita Vijaykumar and Onur Mutlu and Stephen W. Keckler", title = "Transparent offloading and mapping {(TOM)}: enabling programmer-transparent near-data processing in {GPU} systems", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "204--216", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001159", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Main memory bandwidth is a critical bottleneck for modern GPU systems due to limited off-chip pin bandwidth. 3D-stacked memory architectures provide a promising opportunity to significantly alleviate this bottleneck by directly connecting a logic layer to the DRAM layers with high bandwidth connections. Recent work has shown promising potential performance benefits from an architecture that connects multiple such 3D-stacked memories and offloads bandwidth-intensive computations to a GPU in each of the logic layers. An unsolved key challenge in such a system is how to enable computation offloading and data mapping to multiple 3D-stacked memories without burdening the programmer such that any application can transparently benefit from near-data processing capabilities in the logic layer. Our paper develops two new mechanisms to address this key challenge. 
First, a compiler-based technique that automatically identifies code to offload to a logic-layer GPU based on a simple cost-benefit analysis. Second, a software/hardware cooperative mechanism that predicts which memory pages will be accessed by offloaded code, and places those pages in the memory stack closest to the offloaded code, to minimize off-chip bandwidth consumption. We call the combination of these two programmer-transparent mechanisms TOM: Transparent Offloading and Mapping. Our extensive evaluations across a variety of modern memory-intensive GPU workloads show that, without requiring any program modification, TOM significantly improves performance (by 30\% on average, and up to 76\%) compared to a baseline GPU system that cannot offload computation to 3D-stacked memories.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Park:2016:ESFb, author = "Chang Hyun Park and Taekyung Heo and Jaehyuk Huh", title = "Efficient synonym filtering and scalable delayed translation for hybrid virtual caching", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "217--229", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001160", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Conventional translation look-aside buffers (TLBs) are required to complete address translation with short latencies, as the address translation is on the critical path of all memory accesses even for L1 cache hits. Such strict TLB latency restrictions limit the TLB capacity, as the latency increase with large TLBs may lower the overall performance even with potential TLB miss reductions. 
Furthermore, TLBs consume a significant amount of energy as they are accessed for every instruction fetch and data access. To avoid the latency restriction and reduce the energy consumption, virtual caching techniques have been proposed to defer translation to after L1 cache misses. However, an efficient solution for the synonym problem has been a critical issue hindering the wide adoption of virtual caching. Based on the virtual caching concept, this study proposes a hybrid virtual memory architecture extending virtual caching to the entire cache hierarchy, aiming to improve both performance and energy consumption. The hybrid virtual caching uses virtual addresses augmented with address space identifiers (ASID) in the cache hierarchy for common non-synonym addresses. For such non-synonyms, the address translation occurs only after last-level cache (LLC) misses. For uncommon synonym addresses, the addresses are translated to physical addresses with conventional TLBs before L1 cache accesses. To support such hybrid translation, we propose an efficient synonym detection mechanism based on Bloom filters which can identify synonym candidates with few false positives. For large memory applications, delayed translation alone cannot solve the address translation problem, as fixed-granularity delayed TLBs may not scale with the increasing memory requirements. To mitigate the translation scalability problem, this study proposes a delayed many segment translation designed for the hybrid virtual caching. The experimental results show that our approach effectively lowers accesses to the TLBs, leading to significant power savings. 
In addition, the approach provides performance improvement with scalable delayed translation with variable length segments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Xu:2016:WSE, author = "Qiumin Xu and Hyeran Jeon and Keunsoo Kim and Won Woo Ro and Murali Annavaram", title = "Warped-slicer: efficient intra-{SM} slicing through dynamic resource partitioning for {GPU} multiprogramming", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "230--242", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001161", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As technology scales, GPUs are forecasted to incorporate an ever-increasing amount of computing resources to support thread-level parallelism. But even with the best effort, exposing massive thread-level parallelism from a single GPU kernel, particularly from general purpose applications, is going to be a difficult challenge. In some cases, even if there is sufficient thread-level parallelism in a kernel, there may not be enough available memory bandwidth to support such massive concurrent thread execution. Hence, GPU resources may be underutilized as more general purpose applications are ported to execute on GPUs. In this paper, we explore multiprogramming GPUs as a way to resolve the resource underutilization issue. There is a growing hardware support for multiprogramming on GPUs. Hyper-Q has been introduced in the Kepler architecture which enables multiple kernels to be invoked via tens of hardware queue streams. Spatial multitasking has been proposed to partition GPU resources across multiple kernels. 
But the partitioning is done at the coarse granularity of streaming multiprocessors (SMs) where each kernel is assigned to a subset of SMs. In this paper, we advocate for partitioning a single SM across multiple kernels, which we term as intra-SM slicing. We explore various intra-SM slicing strategies that slice resources within each SM to concurrently run multiple kernels on the SM. Our results show that there is not one intra-SM slicing strategy that derives the best performance for all application pairs. We propose Warped-Slicer, a dynamic intra-SM slicing strategy that uses an analytical method for calculating the SM resource partitioning across different kernels that maximizes performance. The model relies on a set of short online profile runs to determine how each kernel's performance varies as more thread blocks from each kernel are assigned to an SM. The model takes into account the interference effect of shared resource usage across multiple kernels. The model is also computationally efficient and can determine the resource partitioning quickly to enable dynamic decision making as new kernels enter the system. We demonstrate that the proposed Warped-Slicer approach improves performance by 23\% over the baseline multiprogramming approach with minimal hardware overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Han:2016:EEI, author = "Song Han and Xingyu Liu and Huizi Mao and Jing Pu and Ardavan Pedram and Mark A. Horowitz and William J. 
Dally", title = "{EIE}: efficient inference engine on compressed deep neural network", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "243--254", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001163", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "State-of-the-art deep neural networks (DNNs) have hundreds of millions of connections and are both computationally and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources and power budgets. While custom hardware helps the computation, fetching weights from DRAM is two orders of magnitude more expensive than ALU operations, and dominates the required power. Previously proposed `Deep Compression' makes it possible to fit large DNNs (AlexNet and VGGNet) fully in on-chip SRAM. This compression is achieved by pruning the redundant connections and having multiple connections share the same weight. We propose an energy efficient inference engine (EIE) that performs inference on this compressed network model and accelerates the resulting sparse matrix-vector multiplication with weight sharing. Going from DRAM to SRAM gives EIE 120$ \times $ energy saving; Exploiting sparsity saves 10$ \times $; Weight sharing gives 8$ \times $; Skipping zero activations from ReLU saves another 3$ \times $. Evaluated on nine DNN benchmarks, EIE is 189$ \times $ and 13$ \times $ faster when compared to CPU and GPU implementations of the same DNN without compression. EIE has a processing power of 102 GOPS working directly on a compressed network, corresponding to 3 TOPS on an uncompressed network, and processes FC layers of AlexNet at $ 1.88 \times 10^4 $ frames/sec with a power dissipation of only 600mW.
It is 24,000$ \times $ and 3,400$ \times $ more energy efficient than a CPU and GPU respectively. Compared with DaDianNao, EIE has 2.9$ \times $, 19$ \times $ and 3$ \times $ better throughput, energy efficiency and area efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{LiKamWa:2016:RAC, author = "Robert LiKamWa and Yunhui Hou and Julian Gao and Mia Polansky and Lin Zhong", title = "{RedEye}: analog {ConvNet} image sensor architecture for continuous mobile vision", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "255--266", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001164", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Continuous mobile vision is limited by the inability to efficiently capture image frames and process vision features. This is largely due to the energy burden of analog readout circuitry, data traffic, and intensive computation. To promote efficiency, we shift early vision processing into the analog domain. This results in RedEye, an analog convolutional image sensor that performs layers of a convolutional neural network in the analog domain before quantization. We design RedEye to mitigate analog design complexity, using a modular column-parallel design to promote physical design reuse and algorithmic cyclic reuse. RedEye uses programmable mechanisms to admit noise for tunable energy reduction. 
Compared to conventional systems, RedEye reports an 85\% reduction in sensor energy, 73\% reduction in cloudlet-based system energy, and a 45\% reduction in computation-based system energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Reagen:2016:MEL, author = "Brandon Reagen and Paul Whatmough and Robert Adolf and Saketh Rama and Hyunkwang Lee and Sae Kyu Lee and Jos{\'e} Miguel Hern{\'a}ndez-Lobato and Gu-Yeon Wei and David Brooks", title = "{Minerva}: enabling low-power, highly-accurate deep neural network accelerators", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "267--278", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001165", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The continued success of Deep Neural Networks (DNNs) in classification tasks has sparked a trend of accelerating their execution with specialized hardware. While published designs easily give an order of magnitude improvement over general-purpose hardware, few look beyond an initial implementation. This paper presents Minerva, a highly automated co-design approach across the algorithm, architecture, and circuit levels to optimize DNN hardware accelerators. Compared to an established fixed-point accelerator baseline, we show that fine-grained, heterogeneous datatype optimization reduces power by 1.5$ \times $; aggressive, inline predication and pruning of small activity values further reduces power by 2.0$ \times $; and active hardware fault detection coupled with domain-aware error mitigation eliminates an additional 2.7$ \times $ through lowering SRAM voltages. 
Across five datasets, these optimizations provide a collective average of 8.1$ \times $ power reduction over an accelerator baseline without compromising DNN model accuracy. Minerva enables highly accurate, ultra-low power DNN accelerators (in the range of tens of milliwatts), making it feasible to deploy DNNs in power-constrained IoT and mobile devices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Yao:2016:OCO, author = "Yuan Yao and Zhonghai Lu", title = "Opportunistic competition overhead reduction for expediting critical section in {NoC} based {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "279--290", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001167", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the degree of parallelism increasing, performance of multi-threaded shared variable applications is not only limited by serialized critical section execution, but also by the serialized competition overhead for threads to get access to critical section. As the number of concurrent threads grows, such competition overhead may exceed the time spent in critical section itself, and become the dominating factor limiting the performance of parallel applications. In modern operating systems, queue spinlock, which comprises a low-overhead spinning phase and a high-overhead sleeping phase, is often used to lock critical sections. In the paper, we show that this advanced locking solution may create very high competition overhead for multithreaded applications executing in NoC-based CMPs. 
Then we propose a software-hardware cooperative mechanism that can opportunistically maximize the chance that a thread wins the critical section access in the low-overhead spinning phase, thereby reducing the competition overhead. At the OS primitives level, we monitor the remaining times of retry (RTR) in a thread's spinning phase, which reflects in how long the thread must enter into the high-overhead sleep mode. At the hardware level, we integrate the RTR information into the packets of locking requests, and let the NoC prioritize locking request packets according to the RTR information. The principle is that the smaller RTR a locking request packet carries, the higher priority it gets and thus quicker delivery. We evaluate our opportunistic competition overhead reduction technique with cycle-accurate full-system simulations in GEM5 using PARSEC (11 programs) and SPEC OMP2012 (14 programs) benchmarks. Compared to the original queue spinlock implementation, experimental results show that our method can effectively increase the opportunity of threads entering the critical section in low-overhead spinning phase, reducing the competition overhead averagely by 39.9\% (maximally by 61.8\%) and accelerating the execution of the Region-of-Interest averagely by 14.4\% (maximally by 24.5\%) across all 25 benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Kim:2016:SCD, author = "Channoh Kim and Sungmin Kim and Hyeon Gyu Cho and Dooyoung Kim and Jaehyeok Kim and Young H. Oh and Hakbeom Jang and Jae W. 
Lee", title = "Short-circuit dispatch: accelerating virtual machine interpreters on embedded processors", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "291--303", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001168", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Interpreters are widely used to implement high-level language virtual machines (VMs), especially on resource-constrained embedded platforms. Many scripting languages employ interpreter-based VMs for their advantages over native code compilers, such as portability, smaller resource footprint, and compact codes. For efficient interpretation a script (program) is first compiled into an intermediate representation, or bytecodes. The canonical interpreter then runs an infinite loop that fetches, decodes, and executes one bytecode at a time. This bytecode dispatch loop is a well-known source of inefficiency, typically featuring a large jump table with a hard-to-predict indirect jump. Most existing techniques to optimize this loop focus on reducing the misprediction rate of this indirect jump in both hardware and software. However, these techniques are much less effective on embedded processors with shallow pipelines and low IPCs. Instead, we tackle another source of inefficiency more prominent on embedded platforms---redundant computation in the dispatch loop. To this end, we propose Short-Circuit Dispatch (SCD), a low-cost architectural extension that enables fast, hardware-based bytecode dispatch with fewer instructions. The key idea of SCD is to overlay the software-created bytecode jump table on a branch target buffer (BTB). Once a bytecode is fetched, the BTB is looked up using the bytecode, instead of PC, as key.
If it hits, the interpreter directly jumps to the target address retrieved from the BTB; otherwise, it goes through the original dispatch path. This effectively eliminates redundant computation in the dispatcher code for decode, bound check, and target address calculation, thus significantly reducing total instruction count. Our simulation results demonstrate that SCD achieves geomean speedups of 19.9\% and 14.1\% for two production-grade script interpreters for Lua and JavaScript, respectively. Moreover, our fully synthesizable RTL design based on a RISC-V embedded processor shows that SCD improves the EDP of the Lua interpreter by 24.2\%, while increasing the chip area by only 0.72\% at a 40nm technology node.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Dall:2016:AVP, author = "Christoffer Dall and Shih-Wei Li and Jin Tack Lim and Jason Nieh and Georgios Koloventzos", title = "{ARM} virtualization: performance and architectural implications", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "304--316", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001169", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "ARM servers are becoming increasingly common, making server technologies such as virtualization for ARM of growing importance. We present the first study of ARM virtualization performance on server hardware, including multicore measurements of two popular ARM and x86 hypervisors, KVM and Xen. We show how ARM hardware support for virtualization can enable much faster transitions between VMs and the hypervisor, a key hypervisor operation. 
However, current hypervisor designs, including both Type 1 hypervisors such as Xen and Type 2 hypervisors such as KVM, are not able to leverage this performance benefit for real application workloads. We discuss the reasons why and show that other factors related to hypervisor software design and implementation have a larger role in overall performance. Based on our measurements, we discuss changes to ARM's hardware virtualization support that can potentially bridge the gap to bring its faster VM-to-hypervisor transition mechanism to modern Type 2 hypervisors running real applications. These changes have been incorporated into the latest ARM architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Gaur:2016:BVC, author = "Jayesh Gaur and Alaa R. Alameldeen and Sreenivas Subramoney", title = "Base-victim compression: an opportunistic cache compression architecture", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "317--328", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001171", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The memory wall has motivated many enhancements to cache management policies aimed at reducing misses. Cache compression has been proposed to increase effective cache capacity, which potentially reduces capacity and conflict misses. However, complexity in cache compression implementations could increase cache power and access latency. On the other hand, advanced cache replacement mechanisms use heuristics to reduce misses, leading to significant performance gains. 
Both cache compression and replacement policies should collaborate to improve performance. In this paper, we demonstrate that cache compression and replacement policies can interact negatively. In many workloads, performance gains from replacement policies are lost due to the need to alter the replacement policy to accommodate compression. This leads to sub-optimal replacement policies that could lose performance compared to an uncompressed cache. We introduce a novel, opportunistic cache compression mechanism, Base-Victim, based on an efficient cache design. Our compression architecture improves performance on top of advanced cache replacement policies, and guarantees a hit rate at least as high as that of an uncompressed cache. For cache-sensitive applications, Base-Victim achieves an average 7.3\% performance gain for single-threaded workloads, and 8.7\% gain for four-thread multi-program workload mixes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Kim:2016:BPC, author = "Jungrae Kim and Michael Sullivan and Esha Choukse and Mattan Erez", title = "Bit-plane compression: transforming data for better compression in many-core architectures", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "329--340", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001172", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As key applications become more data-intensive and the computational throughput of processors increases, the amount of data to be transferred in modern memory subsystems grows. Increasing physical bandwidth to keep up with the demand growth is challenging, however, due to strict area and energy limitations. 
This paper presents a novel and lightweight compression algorithm, Bit-Plane Compression (BPC), to increase the effective memory bandwidth. BPC aims at homogeneously-typed memory blocks, which are prevalent in many-core architectures, and applies a smart data transformation to both improve the inherent data compressibility and to reduce the complexity of compression hardware. We demonstrate that BPC provides superior compression ratios of 4.1:1 for integer benchmarks and reduces memory bandwidth requirements significantly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Nair:2016:XEE, author = "Prashant J. Nair and Vilas Sridharan and Moinuddin K. Qureshi", title = "{XED}: exposing on-die error detection information for strong memory reliability", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "341--353", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001174", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Large-granularity memory failures continue to be a critical impediment to system reliability. To make matters worse, as DRAM scales to smaller nodes, the frequency of unreliable bits in DRAM chips continues to increase. To mitigate such scaling-related failures, memory vendors are planning to equip existing DRAM chips with On-Die ECC. For maintaining compatibility with memory standards, On-Die ECC is kept invisible from the memory controller. This paper explores how to design high reliability memory systems in presence of On-Die ECC. We show that if On-Die ECC is not exposed to the memory system, having a 9-chip ECC-DIMM (implementing SECDED) provides almost no reliability benefits compared to an 8-chip non-ECC DIMM. 
We also show that if the error detection of On-Die ECC can be exposed to the memory controller, then Chipkill-level reliability can be achieved even with a 9-chip ECC-DIMM. To this end, we propose eXposed On-Die Error Detection (XED), which exposes the On-Die error detection information without requiring changes to the memory standards or consuming bandwidth overheads. When the On-Die ECC detects an error, XED transmits a pre-defined ``catch-word'' instead of the corrected data value. On receiving the catch-word, the memory controller uses the parity stored in the 9th chip of the ECC-DIMM to correct the faulty chip (similar to RAID-3). Our studies show that XED provides Chipkill-level reliability (172x higher than SECDED), while incurring negligible overheads, with a 21\% lower execution time than Chipkill. We also show that XED can enable Chipkill systems to provide Double-Chipkill level reliability while avoiding the associated storage, performance, and power overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{ulAlam:2016:PRS, author = "Mohammad Mejbah ul Alam and Abdullah Muzahid", title = "Production-run software failure diagnosis via \underline{a}daptive \underline{c}ommunication \underline{t}racking", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "354--366", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001175", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software failure diagnosis techniques work either by sampling some events at production-run time or by using some bug detection algorithms. Some of the techniques require the failure to be reproduced multiple times.
The ones that do not require such, are not adaptive enough when the execution platform, environment or code changes. We propose ACT, a diagnosis technique for production-run failures, that uses the machine intelligence of neural hardware. ACT learns some invariants (e.g., data communication invariants) on-the-fly using the neural hardware and records any potential violation of them. Since ACT can learn invariants on-the-fly, it can adapt to any change in execution setting or code. Since it records only the potentially violated invariants, the postprocessing phase can pinpoint the root cause fairly accurately without requiring to observe the failure again. ACT works seamlessly for many sequential and concurrency bugs. The paper provides a detailed design and implementation of ACT in a typical multiprocessor system. It uses a three stage pipeline for partially configurable one hidden layer neural networks. We have evaluated ACT on a variety of programs from popular benchmarks as well as open source programs. ACT diagnoses failures caused by 16 bugs from these programs with accurate ranking. Compared to existing learning and sampling based approaches, ACT has better diagnostic ability. 
For the default configuration, ACT has an average execution overhead of 8.2\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Chen:2016:ESA, author = "Yu-Hsin Chen and Joel Emer and Vivienne Sze", title = "{Eyeriss}: a spatial architecture for energy-efficient dataflow for convolutional neural networks", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "367--379", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001177", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Deep convolutional neural networks (CNNs) are widely used in modern AI systems for their superior accuracy but at the cost of high computational complexity. The complexity comes from the need to simultaneously process hundreds of filters and channels in the high-dimensional convolutions, which involve a significant amount of data movement. Although highly-parallel compute paradigms, such as SIMD/SIMT, effectively address the computation requirement to achieve high throughput, energy consumption still remains high as data movement can be more expensive than computation. Accordingly, finding a dataflow that supports parallel processing with minimal data movement cost is crucial to achieving energy-efficient CNN processing without compromising accuracy. In this paper, we present a novel dataflow, called row-stationary (RS), that minimizes data movement energy consumption on a spatial architecture. This is realized by exploiting local data reuse of filter weights and feature map pixels, i.e., activations, in the high-dimensional convolutions, and minimizing data movement of partial sum accumulations. 
Unlike dataflows used in existing designs, which only reduce certain types of data movement, the proposed RS dataflow can adapt to different CNN shape configurations and reduces all types of data movement through maximally utilizing the processing engine (PE) local storage, direct inter-PE communication and spatial parallelism. To evaluate the energy efficiency of the different dataflows, we propose an analysis framework that compares energy cost under the same hardware area and processing parallelism constraints. Experiments using the CNN configurations of AlexNet show that the proposed RS dataflow is more energy efficient than existing dataflows in both convolutional (1.4$ \times $ to 2.5$ \times $) and fully-connected layers (at least 1.3$ \times $ for batch size larger than 16). The RS dataflow has also been demonstrated on a fabricated chip, which verifies our energy analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Kim:2016:NPD, author = "Duckhwan Kim and Jaeha Kung and Sek Chai and Sudhakar Yalamanchili and Saibal Mukhopadhyay", title = "{Neurocube}: a programmable digital neuromorphic architecture with high-density {$3$D} memory", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "380--392", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001178", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents a programmable and scalable digital neuromorphic architecture based on 3D high-density memory integrated with logic tier for efficient neural computing. 
The proposed architecture consists of clusters of processing engines, connected by 2D mesh network as a processing tier, which is integrated in 3D with multiple tiers of DRAM. The PE clusters access multiple memory channels (vaults) in parallel. The operating principle, referred to as the memory centric computing, embeds specialized state-machines within the vault controllers of HMC to drive data into the PE clusters. The paper presents the basic architecture of the Neurocube and an analysis of the logic tier synthesized in 28nm and 15nm process technologies. The performance of the Neurocube is evaluated and illustrated through the mapping of a Convolutional Neural Network and estimating the subsequent power and performance for both training and inference.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Liu:2016:CIS, author = "Shaoli Liu and Zidong Du and Jinhua Tao and Dong Han and Tao Luo and Yuan Xie and Yunji Chen and Tianshi Chen", title = "{Cambricon}: an instruction set architecture for neural networks", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "393--405", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001179", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Neural Networks (NN) are a family of models for a broad range of emerging machine learning and pattern recognition applications. NN techniques are conventionally executed on general-purpose processors (such as CPU and GPGPU), which are usually not energy-efficient since they invest excessive hardware resources to flexibly support various workloads. 
Consequently, application-specific hardware accelerators for neural networks have been proposed recently to improve the energy-efficiency. However, such accelerators were designed for a small set of NN techniques sharing similar computational patterns, and they adopt complex and informative instructions (control signals) directly corresponding to high-level functional blocks of an NN (such as layers), or even an NN as a whole. Although straightforward and easy-to-implement for a limited set of similar NN techniques, the lack of agility in the instruction set prevents such accelerator designs from supporting a variety of different NN techniques with sufficient flexibility and efficiency. In this paper, we propose a novel domain-specific Instruction Set Architecture (ISA) for NN accelerators, called Cambricon, which is a load-store architecture that integrates scalar, vector, matrix, logical, data transfer, and control instructions, based on a comprehensive analysis of existing NN techniques. Our evaluation over a total of ten representative yet distinct NN techniques have demonstrated that Cambricon exhibits strong descriptive capacity over a broad range of NN techniques, and provides higher code density than general-purpose ISAs such as x86, MIPS, and GPGPU. Compared to the latest state-of-the-art NN accelerator design DaDianNao [5] (which can only accommodate 3 types of NN techniques), our Cambricon-based accelerator prototype implemented in TSMC 65nm technology incurs only negligible latency/power/area overheads, with a versatile coverage of 10 different NN benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Huang:2016:DLN, author = "Ziqiang Huang and Andrew D. Hilton and Benjamin C. 
Lee", title = "Decoupling loads for nano-instruction set computers", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "406--417", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001181", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose an ISA extension that decouples the data access and register write operations in a load instruction. We describe system and hardware support for decoupled loads. Furthermore, we show how compilers can generate better static instruction schedules by hoisting a decoupled load's data access above may-alias stores and branches. We find that decoupled loads improve performance with geometric mean speedups of 8.4\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Hayes:2016:FVM, author = "Timothy Hayes and Oscar Palomar and Osman Unsal and Adrian Cristal and Mateo Valero", title = "Future vector microprocessor extensions for data aggregations", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "418--430", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001182", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As the rate of annual data generation grows exponentially, there is a demand to aggregate and summarise vast amounts of information quickly. In the past, frequency scaling was relied upon to push application throughput. Today, Dennard scaling has ceased and further performance must come from exploiting parallelism. 
Single instruction-multiple data (SIMD) instruction sets offer a highly efficient and scalable way of exploiting data-level parallelism (DLP). While microprocessors originally offered very simple SIMD support targeted at multimedia applications, these extensions have been growing both in width and functionality. Observing this trend, we use a simulation framework to model future SIMD support and then propose and evaluate five different ways of vectorising data aggregation. We find that although data aggregation is abundant in DLP, it is often too irregular to be expressed efficiently using typical SIMD instructions. Based on this observation, we propose a set of novel algorithms and SIMD instructions to better capture this irregular DLP. Furthermore, we discover that the best algorithm is highly dependent on the characteristics of the input. Our proposed solution can dynamically choose the optimal algorithm in the majority of cases and achieves speedups between 2.7 $ \times $ and 7.6 $ \times $ over a scalar baseline.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Sleiman:2016:ESO, author = "Faissal M. Sleiman and Thomas F. Wenisch", title = "Efficiently scaling out-of-order cores for simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "431--443", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001183", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Simultaneous multithreading (SMT) out-of-order cores waste a significant portion of structural out-of-order core resources on instructions that do not need them. These resources eliminate false ordering dependences. 
However, because thread interleaving spreads dependent instructions, nearly half of instructions dynamically issue in program order after all false dependences have resolved. These in-sequence instructions interleave with other reordered instructions at a fine granularity within the instruction window. We develop a technique to efficiently scale in-flight instructions through a hybrid out-of-order/in-order microarchitecture, which can dispatch instructions to efficient in-order scheduling mechanisms---using a FIFO issue queue called the shelf ---on an instruction-by-instruction basis. Instructions dispatched to the shelf do not allocate out-of-order core resources in the reorder buffer, issue queue, physical registers, or load-store queues. We measure opportunity for such hybrid microarchitectures and design and evaluate a practical dispatch mechanism targeted at 4-threaded cores. Adding a shelf to a baseline 4-thread system with 64-entry ROB improves normalized system throughput by 11.5\% (up to 19.2\% at best) and energy-delay product by 10.9\% (up to 17.5\% at best).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Hashemi:2016:ADC, author = "Milad Hashemi and Khubaib and Eiman Ebrahimi and Onur Mutlu and Yale N. Patt", title = "Accelerating dependent cache misses with an enhanced memory controller", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "444--455", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001184", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "On-chip contention increases memory access latency for multicore processors. 
We identify that this additional latency has a substantial effect on performance for an important class of latency-critical memory operations: those that result in a cache miss and are dependent on data from a prior cache miss. We observe that the number of instructions between the first cache miss and its dependent cache miss is usually small. To minimize dependent cache miss latency, we propose adding just enough functionality to dynamically identify these instructions at the core and migrate them to the memory controller for execution as soon as source data arrives from DRAM. This migration allows memory requests issued by our new Enhanced Memory Controller (EMC) to experience a 20\% lower latency than if issued by the core. On a set of memory intensive quad-core workloads, the EMC results in a 13\% improvement in system performance and a 5\% reduction in energy consumption over a system with a Global History Buffer prefetcher, the highest performing prefetcher in our evaluation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Zhang:2016:TAS, author = "Yunqi Zhang and David Meisner and Jason Mars and Lingjia Tang", title = "{Treadmill}: attributing the source of tail latency through precise load testing and statistical inference", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "456--468", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001186", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Managing tail latency of requests has become one of the primary challenges for large-scale Internet services. 
Data centers are quickly evolving and service operators frequently desire to make changes to the deployed software and production hardware configurations. Such changes demand a confident understanding of the impact on one's service, in particular its effect on tail latency (e.g., 95th- or 99th-percentile response latency of the service). Evaluating the impact on the tail is challenging because of its inherent variability. Existing tools and methodologies for measuring these effects suffer from a number of deficiencies including poor load tester design, statistically inaccurate aggregation, and improper attribution of effects. As shown in the paper, these pitfalls can often result in misleading conclusions. In this paper, we develop a methodology for statistically rigorous performance evaluation and performance factor attribution for server workloads. First, we find that careful design of the server load tester can ensure high quality performance evaluation, and empirically demonstrate the inaccuracy of load testers in previous work. Learning from the design flaws in prior work, we design and develop a modular load tester platform, Treadmill, that overcomes pitfalls of existing tools. Next, utilizing Treadmill, we construct measurement and analysis procedures that can properly attribute performance factors. We rely on statistically-sound performance evaluation and quantile regression, extending it to accommodate the idiosyncrasies of server systems. Finally, we use our augmented methodology to evaluate the impact of common server hardware features with Facebook production workloads on production hardware. We decompose the effects of these features on request tail latency and demonstrate that our evaluation methodology provides superior results, particularly in capturing complicated and counter-intuitive performance behaviors. 
By tuning the hardware features as suggested by the attribution, we reduce the 99th-percentile latency by 43\% and its variance by 93\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Wu:2016:DFD, author = "Qiang Wu and Qingyuan Deng and Lakshmi Ganesh and Chang-Hong Hsu and Yun Jin and Sanjeev Kumar and Bin Li and Justin Meza and Yee Jiun Song", title = "{Dynamo}: {Facebook}'s data center-wide power management system", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "469--480", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001187", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Data center power is a scarce resource that often goes underutilized due to conservative planning. This is because the penalty for overloading the data center power delivery hierarchy and tripping a circuit breaker is very high, potentially causing long service outages. Recently, dynamic server power capping, which limits the amount of power consumed by a server, has been proposed and studied as a way to reduce this penalty, enabling more aggressive utilization of provisioned data center power. However, no real at-scale solution for data center-wide power monitoring and control has been presented in the literature. In this paper, we describe Dynamo --- a data center-wide power management system that monitors the entire power hierarchy and makes coordinated control decisions to safely and efficiently use provisioned data center power. Dynamo has been developed and deployed across all of Facebook's data centers for the past three years. 
Our key insight is that in real-world data centers, different power and performance constraints at different levels in the power hierarchy necessitate coordinated data center-wide power management. We make three main contributions. First, to understand the design space of Dynamo, we provide a characterization of power variation in data centers running a diverse set of modern workloads. This characterization uses fine-grained power samples from tens of thousands of servers and spanning a period of over six months. Second, we present the detailed design of Dynamo. Our design addresses several key issues not addressed by previous simulation-based studies. Third, the proposed techniques and design have been deployed and evaluated in large scale data centers serving billions of users. We present production results showing that Dynamo has prevented 18 potential power outages in the past 6 months due to unexpected power surges; that Dynamo enables optimizations leading to a 13\% performance boost for a production Hadoop cluster and a nearly 40\% performance increase for a search cluster; and that Dynamo has already enabled an 8\% increase in the power capacity utilization of one of our data centers with more aggressive power subscription measures underway.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Wong:2016:PEA, author = "Daniel Wong", title = "Peak efficiency aware scheduling for highly energy proportional servers", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "481--492", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001188", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Energy proportionality of data center servers have 
improved drastically over the past decade to the point where near ideal energy proportional servers are now common. These highly energy proportional servers exhibit the unique property where peak efficiency no longer coincides with peak utilization. In this paper, we explore the implications of this property on data center scheduling. We identified that current state of the art data center schedulers does not efficiently leverage these properties, leading to inefficient scheduling decisions. We propose Peak Efficiency Aware Scheduling (PEAS) which can achieve better-than-ideal energy proportionality at the data center level. We demonstrate that PEAS can reduce average power by 25.5\% with 3.0\% improvement to TCO compared to state-of-the-art scheduling policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Li:2016:PAD, author = "Chao Li and Zhenhua Wang and Xiaofeng Hou and Haopeng Chen and Xiaoyao Liang and Minyi Guo", title = "Power attack defense: securing battery-backed data centers", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "493--505", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001189", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Battery systems are crucial components for mission-critical data centers. Without secure energy backup, existing under-provisioned data centers are largely unguarded targets for cyber criminals. Particularly for today's scale-out servers, power oversubscription unavoidably taxes a data center's backup energy resources, leaving very little room for dealing with emergency. 
Besides, the emerging trend towards deploying distributed energy storage architecture causes the associated energy backup of each rack to shrink, making servers vulnerable to power anomalies. As a result, an attacker can generate power peaks to easily crash or disrupt a power-constrained system. This study aims at securing data centers from malicious loads that seek to drain their precious energy storage and overload server racks without prior detection. We term such load as Power Virus (PV) and demonstrate its basic two-phase attacking model and characterize its behaviors on real systems. The PV can learn the victim rack's battery characteristics by disguising as benign loads. Once gaining enough information, the PV can be mutated to generate hidden power spikes that have a high chance to overload the system. To defend against PV, we propose power attack defense (PAD), a novel energy management patch built on lightweight software and hardware mechanisms. PAD not only increases the attacking cost considerably by hiding vulnerable racks from visible spikes, it also strengthens the last line of defense against hidden spikes. Using Google cluster traces we show that PAD can effectively raise the bar of a successful power attack: compared to prior arts, it increases the data center survival time by 1.6$ \sim $11X and provides better performance guarantee. It enables modern data centers to safely exploit the benefits that power oversubscription may provide, with the slightest cost overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Gao:2016:DLP, author = "Mingyu Gao and Christina Delimitrou and Dimin Niu and Krishna T. 
Malladi and Hongzhong Zheng and Bob Brennan and Christos Kozyrakis", title = "{DRAF}: a low-power {DRAM}-based reconfigurable acceleration fabric", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "506--518", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001191", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "FPGAs are a popular target for application-specific accelerators because they lead to a good balance between flexibility and energy efficiency. However, FPGA lookup tables introduce significant area and power overheads, making it difficult to use FPGA devices in environments with tight cost and power constraints. This is the case for datacenter servers, where a modestly-sized FPGA cannot accommodate the large number of diverse accelerators that datacenter applications need. This paper introduces DRAF, an architecture for bit-level reconfigurable logic that uses DRAM subarrays to implement dense lookup tables. DRAF overlaps DRAM operations like bitline precharge and charge restoration with routing within the reconfigurable routing fabric to minimize the impact of DRAM latency. It also supports multiple configuration contexts that can be used to quickly switch between different accelerators with minimal latency. Overall, DRAF trades off some of the performance of FPGAs for significant gains in area and power. DRAF improves area density by 10x over FPGAs and power consumption by more than 3x, enabling DRAF to satisfy demanding applications within strict power and cost constraints. 
While accelerators mapped to DRAF are 2-3x slower than those in FPGAs, they still deliver a 13x speedup and an 11x reduction in power consumption over a Xeon core for a wide range of datacenter tasks, including analytics and interactive services like speech recognition.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Zhang:2016:MWE, author = "Lunkai Zhang and Brian Neely and Diana Franklin and Dmitri Strukov and Yuan Xie and Frederic T. Chong", title = "{Mellow Writes}: extending lifetime in resistive memories through selective slow write backs", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "519--531", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001192", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging resistive memory technologies, such as PCRAM and ReRAM, have been proposed as promising replacements for DRAM-based main memory, due to their better scalability, low standby power, and non-volatility. However, limited write endurance is a major drawback for such resistive memory technologies. Wear leveling (balancing the distribution of writes) and wear limiting (reducing the number of writes) have been proposed to mitigate this disadvantage, but both techniques only manage a fixed budget of writes to a memory system rather than increase the number available. In this paper, we propose a new type of wear limiting technique, Mellow Writes, which reduces the wearout of individual writes rather than reducing the number of writes. Mellow Writes is based on the fact that slow writes performed with lower dissipated power can lead to longer endurance (and therefore longer lifetimes). 
For non-volatile memories, an N$^1$ to N$^3$ times endurance can be achieved if the write operation is slowed down by N times. We present three microarchitectural mechanisms ( Bank-Aware Mellow Writes, Eager Mellow Writes, and Wear Quota ) that selectively perform slow writes to increase memory lifetime while minimizing performance impact. Assuming a factor N$^2$ advantage in cell endurance for a factor N slower write, our best Mellow Writes mechanism can achieve 2.58$ \times $ lifetime and 1.06$ \times $ performance of the baseline system. In addition, its performance is almost the same as a system aggressively optimized for performance (at the expense of endurance). Finally, Wear Quota guarantees a minimal lifetime (e.g., 8 years) by forcing more slow writes in presence of heavy workloads. We also perform sensitivity analysis on the endurance advantage factor for slow writes, from N$^1$ to N$^3$, and find that our technique is still useful for factors as low as N$^1$.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Zhou:2016:MMI, author = "Yanqi Zhou and David Wentzlaff", title = "{MITTS}: memory inter-arrival time traffic shaping", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "532--544", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001193", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory bandwidth severely limits the scalability and performance of multicore and manycore systems. Application performance can be very sensitive to both the delivered memory bandwidth and latency. In multicore systems, a memory channel is usually shared by multiple cores. 
Having the ability to precisely provision, schedule, and isolate memory bandwidth and latency on a per-core basis is particularly important when different memory guarantees are needed on a per-customer, per-application, or per-core basis. Infrastructure as a Service (IaaS) Cloud systems, and even general purpose multicores optimized for application throughput or fairness all benefit from the ability to control and schedule memory access on a fine-grain basis. In this paper, we propose MITTS (Memory Inter-arrival Time Traffic Shaping), a simple, distributed hardware mechanism which limits memory traffic at the source (Core or LLC). MITTS shapes memory traffic based on memory request inter-arrival time, enabling fine-grain bandwidth allocation. In an IaaS system, MITTS enables Cloud customers to express their memory distribution needs and pay commensurately. For instance, MITTS enables charging customers that have bursty memory traffic more than customers with uniform memory traffic for the same aggregate bandwidth. Beyond IaaS systems, MITTS can also be used to optimize for throughput or fairness in a general purpose multi-program workload. MITTS uses an online genetic algorithm to configure hardware bins, which can adapt for program phases and variable input sets. We have implemented MITTS in Verilog and have taped-out the design in a 25-core 32nm processor and find that MITTS requires less than 0.9\% of core area. 
We evaluate across SPECint, PARSEC, Apache, and bhm Mail Server workloads, and find that MITTS achieves an average 1.18$ \times $ performance gain compared to the best static bandwidth allocation, a 2.69$ \times $ average performance/cost advantage in an IaaS setting, and up to 1.17$ \times $ better throughput and 1.52$ \times $ better fairness when compared to conventional memory bandwidth provisioning techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{SanMiguel:2016:AA, author = "Joshua {San Miguel} and Natalie {Enright Jerger}", title = "The anytime automaton", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "545--557", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001195", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Approximate computing is an emerging paradigm enabling tradeoffs between accuracy and efficiency. However, a fundamental challenge persists: state-of-the-art techniques lack the ability to enforce runtime guarantees on accuracy. The convention is to (1) employ offline or online accuracy models, or (2) present experimental results that demonstrate empirically low error. Unfortunately, these approaches are still unable to guarantee acceptability of all application outputs at runtime. We offer a solution that revisits concepts from anytime algorithms. Originally explored for real-time decision problems, anytime algorithms have the property of producing results with increasing accuracy over time. We propose the Anytime Automaton, a new computation model that executes applications as a parallel pipeline of anytime approximations.
An automaton produces approximate versions of the application output with increasing accuracy, guaranteeing that the final precise version is eventually reached. The automaton can be stopped whenever the output is deemed acceptable; otherwise, it is a simple matter of letting it run longer. We present an in-depth analysis of the model and demonstrate attractive runtime-accuracy profiles on various applications. Our anytime automaton is the first step towards systems where the acceptability of an application's output directly governs the amount of time and energy expended.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Wang:2016:AMR, author = "Siyang Wang and Xiangyu Zhang and Yuxuan Li and Ramin Bashizade and Song Yang and Chris Dwyer and Alvin R. Lebeck", title = "Accelerating {Markov} random field inference using molecular optical {Gibbs} sampling units", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "558--569", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001196", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The increasing use of probabilistic algorithms from statistics and machine learning for data analytics presents new challenges and opportunities for the design of computing systems. One important class of probabilistic machine learning algorithms is Markov Chain Monte Carlo (MCMC) sampling, which can be used on a wide variety of applications in Bayesian Inference. However, this probabilistic iterative algorithm can be inefficient in practice on today's processors, especially for problems with high dimensionality and complex structure. 
The source of inefficiency is generating samples from parameterized probability distributions. This paper seeks to address this sampling inefficiency and presents a new approach to support probabilistic computing that leverages the native randomness of Resonance Energy Transfer (RET) networks to construct RET-based sampling units (RSU). Although RSUs can be designed for a variety of applications, we focus on the specific class of probabilistic problems described as Markov Random Field Inference. Our proposed RSU uses a RET network to implement a molecular-scale optical Gibbs sampling unit (RSU-G) that can be integrated into a processor / GPU as specialized functional units or organized as a discrete accelerator. We experimentally demonstrate the fundamental operation of an RSU using a macro-scale hardware prototype. Emulation-based evaluation of two computer vision applications for HD images reveal that an RSU augmented GPU provides speedups over a GPU of 3 and 16. Analytic evaluation shows a discrete accelerator that is limited by 336 GB/s DRAM produces speedups of 21 and 54 versus the GPU implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Huang:2016:EAA, author = "Yipeng Huang and Ning Guo and Mingoo Seok and Yannis Tsividis and Simha Sethumadhavan", title = "Evaluation of an analog accelerator for linear algebra", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "570--582", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001197", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Due to the end of supply voltage scaling and the increasing percentage of dark silicon in modern integrated circuits, researchers are looking 
for new scalable ways to get useful computation from existing silicon technology. In this paper we present a reconfigurable analog accelerator for solving systems of linear equations. Commonly perceived downsides of analog computing, such as low precision and accuracy, limited problem sizes, and difficulty in programming are all compensated for using methods we discuss. Based on a prototyped analog accelerator chip we compare the performance and energy consumption of the analog solver against an efficient digital algorithm running on a CPU, and find that the analog accelerator approach may be an order of magnitude faster and provide one third energy savings, depending on the accelerator design. Due to the speed and efficiency of linear algebra algorithms running on digital computers, an analog accelerator that matches digital performance needs a large silicon footprint. Finally, we conclude that problem classes outside of systems of linear equations may hold more promise for analog acceleration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Wang:2016:LLA, author = "Jin Wang and Norm Rubin and Albert Sidelnik and Sudhakar Yalamanchili", title = "{LaPerm}: locality aware scheduler for dynamic parallelism on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "583--595", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001199", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent developments in GPU execution models and architectures have introduced dynamic parallelism to facilitate the execution of irregular applications where control flow and memory behavior can be unstructured, time-varying, and hierarchical. 
The changes brought about by this extension to the traditional bulk synchronous parallel (BSP) model also create new challenges in exploiting the current GPU memory hierarchy. One of the major challenges is that the reference locality that exists between the parent and child thread blocks (TBs) created during dynamic nested kernel and thread block launches cannot be fully leveraged using the current TB scheduling strategies. These strategies were designed for the current implementations of the BSP model but fall short when dynamic parallelism is introduced since they are oblivious to the hierarchical reference locality. We propose LaPerm, a new locality-aware TB scheduler that exploits such parent-child locality, both spatial and temporal. LaPerm adopts three different scheduling decisions to (i) prioritize the execution of the child TBs, (ii) bind them to the stream multiprocessors (SMXs) occupied by their parent TBs, and (iii) maintain workload balance across compute units. Experiments with a set of irregular CUDA applications executed on a cycle-level simulator employing dynamic parallelism demonstrate that LaPerm is able to achieve an average of 27\% performance improvement over the baseline round-robin TB scheduler commonly used in modern GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Shahar:2016:ACS, author = "Sagi Shahar and Shai Bergman and Mark Silberstein", title = "{ActivePointers}: a case for software address translation on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "596--608", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001200", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern discrete
GPUs have been the processors of choice for accelerating compute-intensive applications, but using them in large-scale data processing is extremely challenging. Unfortunately, they do not provide important I/O abstractions long established in the CPU context, such as memory mapped files, which shield programmers from the complexity of buffer and I/O device management. However, implementing these abstractions on GPUs poses a problem: the limited GPU virtual memory system provides no address space management and page fault handling mechanisms to GPU developers, and does not allow modifications to memory mappings for running GPU programs. We implement ActivePointers, a software address translation layer and paging system that introduces native support for page faults and virtual address space management to GPU programs, and enables the implementation of fully functional memory mapped files on commodity GPUs. Files mapped into GPU memory are accessed using active pointers, which behave like regular pointers but access the GPU page cache under the hood, and trigger page faults which are handled on the GPU. We design and evaluate a number of novel mechanisms, including a translation cache in hardware registers and translation aggregation for deadlock-free page fault handling of threads in a single warp. We extensively evaluate ActivePointers on commodity NVIDIA GPUs using microbenchmarks, and also implement a complex image processing application that constructs a photo collage from a subset of 10 million images stored in a 40GB file. The GPU implementation maps the entire file into GPU memory and accesses it via active pointers. 
The use of active pointers adds only up to 1\% to the application's runtime, while enabling speedups of up to 3.9$ \times $ over a combined CPU+GPU implementation and 2.6$ \times $ over a 12-core CPU-only implementation which uses AVX vector instructions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Yoon:2016:VTM, author = "Myung Kuk Yoon and Keunsoo Kim and Sangpil Lee and Won Woo Ro and Murali Annavaram", title = "Virtual thread: maximizing thread-level parallelism beyond {GPU} scheduling limit", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "609--621", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001201", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern GPUs require tens of thousands of concurrent threads to fully utilize the massive amount of processing resources. However, thread concurrency in GPUs can be diminished either due to shortage of thread scheduling structures (scheduling limit), such as available program counters and single instruction multiple thread stacks, or due to shortage of on-chip memory (capacity limit), such as register file and shared memory. Our evaluations show that in practice concurrency in many general purpose applications running on GPUs is curtailed by the scheduling limit rather than the capacity limit. Maximizing the utilization of on-chip memory resources without unduly increasing the scheduling complexity is a key goal of this paper. This paper proposes a Virtual Thread (VT) architecture which assigns Cooperative Thread Arrays (CTAs) up to the capacity limit, while ignoring the scheduling limit. 
However, to reduce the logic complexity of managing more threads concurrently, we propose to place CTAs into active and inactive states, such that the number of active CTAs still respects the scheduling limit. When all the warps in an active CTA hit a long latency stall, the active CTA is context switched out and the next ready CTA takes its place. We exploit the fact that both active and inactive CTAs still fit within the capacity limit which obviates the need to save and restore large amounts of CTA state. Thus VT significantly reduces performance penalties of CTA swapping. By swapping between active and inactive states, VT can exploit higher degree of thread level parallelism without increasing logic complexity. Our simulation results show that VT improves performance by 23.9\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Kim:2016:AIE, author = "Jungrae Kim and Michael Sullivan and Sangkug Lym and Mattan Erez", title = "All-inclusive {ECC}: thorough end-to-end protection for reliable computer memory", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "622--633", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001203", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Increasing transfer rates and decreasing I/O voltage levels make signals more vulnerable to transmission errors. While the data in computer memory are well-protected by modern error checking and correcting (ECC) codes, the clock, control, command, and address (CCCA) signals are weakly protected or even unprotected such that transmission errors leave serious gaps in data-only protection. 
This paper presents All-Inclusive ECC (AIECC), a memory protection scheme that leverages and augments data ECC to also thoroughly protect CCCA signals. AIECC provides strong end-to-end protection of memory, detecting nearly 100\% of CCCA errors and also preventing transmission errors from causing latent memory data corruption. AIECC provides these system-level benefits without requiring extra storage and transfer overheads and without degrading the effective level of data protection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Duwe:2016:RUF, author = "Henry Duwe and Xun Jian and Daniel Petrisko and Rakesh Kumar", title = "Rescuing uncorrectable fault patterns in on-chip memories through error pattern transformation", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "634--644", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001204", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Voltage scaling can effectively reduce processor power, but also reduces the reliability of the SRAM cells in on-chip memories. Therefore, it is often accompanied by the use of an error correcting code (ECC). To enable reliable and efficient memory operation at low voltages, ECCs for on-chip memories must provide both high error coverage and low correction latency. In this paper, we propose error pattern transformation, a novel low-latency error correction technique that allows on-chip memories to be scaled to voltages lower than what has been previously possible. Our technique relies on the observation that the number of on-chip memory errors that many ECCs can correct differs widely depending on the error patterns in the logical words they protect. 
We propose adaptively rearranging the logical bit to physical bit mapping per word according to the BIST-detectable fault pattern in the physical word. The adaptive logical bit to physical bit mapping transforms many uncorrectable error patterns in the logical words into correctable error patterns and, therefore, improves on-chip memory reliability. This reduces the minimum voltage at which on-chip memory can run by 70mV over the best low-latency ECC baseline, leading to a 25.7\% core-wide power reduction for an ARM Cortex-A7-like core. Energy per instruction is reduced by 15.7\% compared to the best baseline.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Kim:2016:RMR, author = "Dong Wan Kim and Mattan Erez", title = "{RelaxFault} memory repair", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "645--657", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001205", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory system reliability is a serious concern in many systems today, and is becoming more worrisome as technology scales and system size grows. Stronger fault tolerance capability is therefore desirable, but often comes at high cost. In this paper, we propose a low-cost, fault-aware, hardware-only resilience mechanism, RelaxFault, that repairs the vast majority of memory faults using a small amount of the LLC to remap faulty memory locations. RelaxFault requires less than 100KiB of LLC capacity and has near-zero impact on performance and power. By repairing faults, RelaxFault relaxes the requirement for high fault tolerance of other mechanisms, such as ECC.
A better tradeoff between resilience and overhead is made by exploiting an understanding of memory system architecture and fault characteristics. We show that RelaxFault provides better repair capability than prior work of similar cost, improves memory reliability to a greater extent, and significantly reduces the number of maintenance events and memory module replacements. We also propose a more refined memory fault model than prior work and demonstrate its importance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Pothukuchi:2016:UMI, author = "Raghavendra Pradyumna Pothukuchi and Amin Ansari and Petros Voulgaris and Josep Torrellas", title = "Using multiple input, multiple output formal control to maximize resource efficiency in architectures", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "658--670", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001207", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As processors seek more resource efficiency, they increasingly need to target multiple goals at the same time, such as a level of performance, power consumption, and average utilization. Robust control solutions cannot come from heuristic-based controllers or even from formal approaches that combine multiple single-parameter controllers. Such controllers may end-up working against each other. What is needed is control-theoretical MIMO (multiple input, multiple output) controllers, which actuate on multiple inputs and control multiple outputs in a coordinated manner. In this paper, we use MIMO control-theory techniques to develop controllers to dynamically tune architectural parameters in processors. 
To our knowledge, this is the first work in this area. We discuss three ways in which a MIMO controller can be used. We develop an example of MIMO controller and show that it is substantially more effective than controllers based on heuristics or built by combining single-parameter formal controllers. The general approach discussed here is likely to be increasingly relevant as future processors become more resource-constrained and adaptive.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Cherupalli:2016:EDT, author = "Hari Cherupalli and Rakesh Kumar and John Sartori", title = "Exploiting dynamic timing slack for energy efficiency in ultra-low-power embedded systems", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "671--681", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001208", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many emerging applications such as the internet of things, wearables, and sensor networks have ultra-low-power requirements. At the same time, cost and programmability considerations dictate that many of these applications will be powered by general purpose embedded microprocessors and microcontrollers, not ASICs. In this paper, we exploit a new opportunity for improving energy efficiency in ultra-low-power processors expected to drive these applications --- dynamic timing slack. Dynamic timing slack exists when an embedded software application executed on a processor does not exercise the processor's static critical paths. 
In such scenarios, the longest path exercised by the application has additional timing slack which can be exploited for power savings at no performance cost by scaling down the processor's voltage at the same frequency until the longest exercised paths just meet timing constraints. Paths that cannot be exercised by an application can safely be allowed to violate timing constraints. We show that dynamic timing slack exists for many ultra-low-power applications and that exploiting dynamic timing slack can result in significant power savings for many ultra-low-power processors. We also present an automated methodology for identifying dynamic timing slack and selecting a safe operating point for a processor and a particular embedded software. Our approach for identifying and exploiting dynamic timing slack is non-speculative, requires no programmer intervention and little or no hardware support, and demonstrates potential power savings of up to 32\%, 25\% on average, over a range of embedded applications running on a common ultra-low-power processor, at no performance cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Zhou:2016:CSI, author = "Yanqi Zhou and Henry Hoffmann and David Wentzlaff", title = "{CASH}: supporting {IaaS} customers with a sub-core configurable architecture", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "682--694", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001209", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Infrastructure as a Service (IaaS) Clouds have grown increasingly important. 
Recent architecture designs support IaaS providers through fine-grain configurability, allowing providers to orchestrate low-level resource usage. Little work, however, has been devoted to supporting IaaS customers who must determine how to use such fine-grain configurable resources to meet quality-of-service (QoS) requirements while minimizing cost. This is a difficult problem because the multiplicity of configurations creates a non-convex optimization space. In addition, this optimization space may change as customer applications enter and exit distinct processing phases. In this paper, we overcome these issues by proposing CASH: a fine-grain configurable architecture co-designed with a cost-optimizing runtime system. The hardware architecture enables configurability at the granularity of individual ALUs and L2 cache banks and provides unique interfaces to support low-overhead, dynamic configuration and monitoring. The runtime uses a combination of control theory and machine learning to configure the architecture such that QoS requirements are met and cost is minimized. Our results demonstrate that the combination of fine-grain configurability and non-convex optimization provides tremendous cost savings (70\% savings) compared to coarse-grain heterogeneity and heuristic optimization. In addition, the system is able to customize configurations to particular applications, respond to application phases, and provide near optimal cost for QoS targets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Arjomand:2016:BAP, author = "Mohammad Arjomand and Mahmut T. Kandemir and Anand Sivasubramaniam and Chita R. 
Das", title = "Boosting access parallelism to {PCM}-based main memory", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "695--706", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001211", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Despite its promise as a DRAM main memory replacement, Phase Change Memory (PCM) has high write latencies which can be a serious detriment to its widespread adoption. Apart from slowing down a write request, the consequent high latency can also keep other chips of the same rank, that are not involved in this write, idle for long times. There are several practical considerations that make it difficult to allow subsequent reads and/or writes to be served concurrently from the same chips during the long latency write. This paper proposes and evaluates several novel mechanisms --- re-constructing data from error correction bits instead of waiting for chips currently busy to serve a read, rotating word mappings across chips of a PCM rank, and rotating the mapping of error detection/correction bits across these chips --- to overlap several reads with an ongoing write (RoW) and even a write with an ongoing write (WoW). The paper also presents the necessary micro-architectural enhancements needed to implement these mechanisms, without significantly changing the current interfaces. The resulting PCM access parallelism (PCMap) system incorporating these enhancements, boosts the intra-rank-level parallelism during such writes from a very low baseline value of 2.4 to an average and maximum values of 4.5 and 7.4, respectively (out of a maximum of 8.0), across a wide spectrum of both multiprogrammed and multithreaded workloads. 
This boost in parallelism results in an average IPC improvement of 15.6\% and 16.7\% for the multiprogrammed and multithreaded workloads, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Gandhi:2016:APE, author = "Jayneel Gandhi and Mark D. Hill and Michael M. Swift", title = "Agile paging: exceeding the best of nested and shadow paging", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "707--718", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001212", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Virtualization provides benefits for many workloads, but the overheads of virtualizing memory are not universally low. The cost comes from managing two levels of address translation---one in the guest virtual machine (VM) and the other in the host virtual machine monitor (VMM)---with either nested or shadow paging. Nested paging directly performs a two-level page walk that makes TLB misses slower than unvirtualized native, but enables fast page table changes. Alternatively, shadow paging restores native TLB miss speeds, but requires costly VMM intervention on page table updates. This paper proposes agile paging that combines both techniques and exceeds the best of both. A virtualized page walk starts with shadow paging and optionally switches in the same page walk to nested paging where frequent page table updates would cause costly VMM interventions. Agile paging enables most TLB misses to be handled as fast as native while most page table changes avoid VMM intervention. It requires modest changes to hardware (e.g., demark when to switch) and VMM policies (e.g., predict good switching opportunities).
We emulate the proposed hardware and prototype the software in Linux with KVM on x86-64. Agile paging performs more than 12\% better than the best of the two techniques and comes within 4\% of native execution for all workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Seol:2016:EED, author = "Hoseok Seol and Wongyu Shin and Jaemin Jang and Jungwhan Choi and Jinwoong Suh and Lee-Sup Kim", title = "Energy efficient data encoding in {DRAM} channels exploiting data value similarity", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "719--730", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001213", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As DRAM data bandwidth increases, tremendous energy is dissipated in the DRAM data bus. To reduce the energy consumed in the data bus, DRAM interfaces with asymmetric termination, such as Pseudo Open Drain (POD) and Low Voltage Swing Terminated Logic (LVSTL), have been adopted in modern DRAMs. In interfaces using asymmetric termination, the amount of termination energy is proportional to the Hamming weight of the data words. In this work, we propose Bitwise Difference Encoding (BD-Encoding), which decreases the Hamming weight of data words, leading to a reduction in energy consumption in the modern DRAM data bus. Since smaller Hamming weight of the data words also reduces switching activity, switching energy and power noise are also both reduced. BD-Encoding exploits the similarity in data words in the DRAM data bus. We observed that similar data words (i.e., data words whose Hamming distance is small) are highly likely to be sent over at similar times.
Based on this observation, BD-coder stores the data recently sent over in both the memory controller and DRAMs. Then, BD-coder transfers the bitwise difference between the current data and the most similar data. In an evaluation using SPEC 2006, BD-Encoding using 64 recent data reduced termination energy by 58.3\% and switching energy by 45.3\%. In addition, 55\% of the LdI/dt noise was decreased with BD-Encoding.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ISCA '16 conference proceedings.", } @Article{Sheng:2016:CCF, author = "Jiayi Sheng and Qingqing Xiong and Chen Yang and Martin C. Herbordt", title = "Collective Communication on {FPGA} Clusters with Static Scheduling", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "2--7", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039904", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "FPGA-centric clouds and clusters provide direct and programmable interconnects with obvious benefits for communication latency and bandwidth. One rarely studied aspect of DPI is that they facilitate application-aware routing: if communication patterns are static and known a priori, as is usually the case, then judicious routing can reduce congestion, latency, and the hardware required. In this study we explore applying the method of offline/static routing to collective operations, in particular, multicast and reduction. An entirely new communication infrastructure is proposed and implemented, including switch design and routing algorithm. A substantial improvement in performance is obtained, especially for multicast. 
We believe that this is one of the few general offline/static routing solutions for real HPC clusters, and FPGA-centric clusters in particular.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Mashimo:2016:CEH, author = "Susumu Mashimo and Thiem Van Chu and Kenji Kise", title = "Cost-Effective and High-Throughput Merge Network: Architecture for the Fastest {FPGA} Sorting Accelerator", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "8--13", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039905", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "High-performance sorting is used in various areas such as database transactions and genomic feature operations. To improve sorting performance, in addition to the conventional approach of using general purpose processors or GPUs, the approach of using FPGAs is becoming a promising solution. As an FPGA sorting accelerator, Casper and Olukotun have recently proposed the fastest one known so far. In their study, they proposed a merge network which can merge two sorted data series at a throughput of 6 data elements per 200MHz clock cycle. If an FPGA sorting accelerator is constructed using merge networks, the overall throughput will be mainly determined by the throughputs of the merge networks. This motivates us to design a merge network which outputs more than 6 data elements per 200MHz clock cycle. In this paper, we propose a cost-effective and high-throughput merge network for the fastest FPGA sorting accelerator. 
The evaluation shows that our proposal achieves a throughput of 8 data elements per 200MHz clock cycle.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Pham-Quoc:2016:FBM, author = "Cuong Pham-Quoc and Biet Nguyen and Tran Ngoc Thinh", title = "{FPGA}-based Multicore Architecture for Integrating Multiple {DDoS} Defense Mechanisms", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "14--19", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039906", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper proposes an FPGA-based multicore architecture to integrate multiple DDoS defense mechanisms for DDoS protection. The architecture allows multiple cooperating DDoS mitigation techniques to classify incoming network packets. The proposed architecture consists of two separate partitions static and dynamic. The static partition includes packet pre-processing and post-processing modules while the DDoS filtering techniques are implemented within the dynamic partition. These filtering techniques can be implemented by either hardware custom computing cores or general purpose soft processors or both. In all cases, these DDoS filtering computing cores can be updated or changed at runtime or design time. We implement our first prototype system with the Hop-count filtering and Ingress/Egress filtering techniques using the Xilinx Virtex 5 xc5vtx240t FPGA device. The synthesis results show that the system can work at up to 116.782MHz while utilizing about 41\% LUTs, 47\% Registers, and 53\% Block Memory of the available hardware resources. 
Experimental results show that our system achieves a 100\% detection rate (true positive) with a 0\% false negative rate and the maximum 0.74\% false positive rate. Moreover, the prototype system obtains packet processing throughput by up to 9.869 Gbps in half-duplex mode and 19.738 Gbps in full-duplex mode.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Eslami:2016:IOM, author = "Fatemeh Eslami and Steven J. E. Wilton", title = "An Improved Overlay and Mapping Algorithm Supporting Rapid Triggering for {FPGA} Debug", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "20--25", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039907", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Embedded system designers can benefit from FPGA accelerators to achieve higher performance and efficiency. However, there are challenges that do not exist in software development; using software simulators to validate large and complex hardware designs can be extremely slow and impractical. Debugging designs implemented on an FPGA enables running the design at speed for long runs and more exhaustive test cases. However, limited observability is the primary challenge in hardware debug. To enhance hardware observability, trace-buffers and a trigger circuitry are inserted into the design. During the device operation, a history of signals of interest is recorded into the trace-buffers for off-line debug and validation. Recompiling the design every time the designer wishes to modify the trigger condition results in long debug turn-around times and reduced productivity. 
In this work, we present a pre-synthesized overlay fabric and algorithm to enable rapid triggering; during debug turn-around, TriggerPlus, a greedy algorithm, is used to implement a trigger circuit on the overlay. TriggerPlus is fast and simple, yet still capable of mapping the trigger circuit to the overlay fabric. We evaluate our techniques using VPR, showing that using our overlay and mapping algorithm together is at least an order of magnitude faster than the previous work resulting in a significant reduction in debug turn-around times.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Kobayashi:2016:HSV, author = "Ryohei Kobayashi and Tomohiro Misono and Kenji Kise", title = "A High-speed {Verilog} {HDL} Simulation Method using a Lightweight Translator", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "26--31", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039908", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Designing with Hardware Description Languages (HDLs) is still the de facto standard way to develop FPGA-based custom computing systems, and RTL simulation is an important step in ensuring that the designed hardware behavior meets the design specification. In this paper, we propose a new high-speed Verilog HDL simulation method. It is based on two previously proposed techniques: ArchHDL and Pyverilog. ArchHDL is used as a simulation engine in the method because the RTL simulation provided by ArchHDL can be parallelized with OpenMP. We use Pyverilog to develop a code translator to convert Verilog HDL source code into ArchHDL code, and due to this, the translator can be realized and its implementation is lightweight. 
We compare the proposed method with Synopsys VCS, and the experimental results show that the RTL simulation behavior and speed are same as that of Synopsys VCS and up to 5.8x better respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Sassa:2016:FSP, author = "Shohei Sassa and Kenji Kanazawa and Shaowei Cai and Moritoshi Yasunaga", title = "An {FPGA} Solver for Partial {MaxSAT} Problems Based on Stochastic Local Search", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "32--37", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039909", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper, we propose an FPGA solver for partial maximum satisfiability (PMS) problems based on the Dist algorithm, which is one of the best performing stochastic local search algorithms for PMS problems. The Dist algorithm searches for a truth assignment for the variables that satisfies all of the hard clauses and as many soft clauses as possible by iteratively selecting a variable using a heuristic and flipping its truth value. During each iteration, new candidate variables for flipping are generated and existing ones may disappear. In our solver, the variables that may become new candidates for flipping are evaluated by parallel and pipeline processing, and then only the variables that actually become the candidates for flipping are extracted and gathered up in concurrent with the pipeline processing. The extraction process is not influenced by the number of the new candidates or their random generation, which minimizes the disturbance of the parallel and pipeline processing. 
Our FPGA solver can solve large PMS problems up to 7.74 times faster than running Dist on CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Houtgast:2016:EGI, author = "Ernst Joachim Houtgast and Vlad-Mihai Sima and Koen Bertels and Zaid Al-Ars", title = "An Efficient {GPU}-Accelerated Implementation of Genomic Short Read Mapping with {BWA-MEM}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "38--43", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039910", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Next Generation Sequencing techniques have resulted in an exponential growth in the generation of genetics data, the amount of which will soon rival, if not overtake, other Big Data fields, such as astronomy and streaming video services. To become useful, this data requires processing by a complex pipeline of algorithms, taking multiple days even on large clusters. The mapping stage of such genomics pipelines, which maps the short reads onto a reference genome, takes up a significant portion of execution time. BWA-MEM is the de-facto industry-standard for the mapping stage. Here, a GPU-accelerated implementation of BWA-MEM is proposed. The Seed Extension phase, one of the three main BWA-MEM algorithm phases that requires between 30\%-50\% of overall processing time, is offloaded onto the GPU. A thorough design space analysis is presented for an optimized mapping of this phase onto the GPU. The resulting systolic-array based implementation obtains a two-fold overall application-level speedup, which is the maximum theoretically achievable speedup. Moreover, this speedup is sustained for systems with up to twenty-two logical cores. 
Based on the findings, a number of suggestions are made to improve GPU architecture, resulting in potentially greatly increased performance for bioinformatics-class algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Nakahara:2016:FCS, author = "Hiroki Nakahara and Hiroyuki Nakanishi and Kazumasa Iwai and Tsutomu Sasao", title = "An {FFT} Circuit for a Spectrometer of a Radio Telescope using the Nested {RNS} including the Constant Division", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "44--49", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039911", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A radio telescope analyzes radio frequency (RF) received from celestial objects. It consists of an antenna, a receiver, and a spectrometer. The spectrometer converts the time domain into the frequency domain by an FFT operation. This paper applies an FFT circuit based on nested residue number system (NRNS), which recursively decompose the RNS. It can decompose the MAC unit into circuits with small sizes. In the FFT using the NRNS, a MAC unit is decomposed into 4-bit ones realized by look-up tables of the FPGA. Also, to realize the scaling (truncation) circuit, we propose a constant division algorithm on the FPGA. The truncation is realized by the division of a dynamic range for a subset of moduli. We implemented the proposed NRNS FFT on the Xilinx Inc. Virtex 6 FPGA. Compared with a Xilinx Inc. binary FFT library, although the number of block RAMs (BRAMs) was increased by 38\%, in the RNS FFT, the number of LUTs was decreased by 42-45\% and the maximum clock frequency was increased by 38-74\%. 
With this technique, we successfully implemented an FFT that satisfied the required size and speed specifications on an available FPGA, since the excessive number of LUTs was the bottleneck of the binary FFT.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Pangracious:2016:NTD, author = "Vinod Pangracious and Mulhim Al-Doori", title = "Novel Three-Dimensional Embedded {FPGA} Technology and Architecture", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "50--55", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039912", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In this paper we present a high density three-dimensional (3D) interconnect network implementation based on a modified Mesh-of-Trees (MoT) topology for an embedded FPGA architecture design targeted for high performance 3D integration. To obtain the optimal MoT-based interconnect structure, the routing architecture of the 2D MoT-based FPGA is modified to include long routing segments that span multiple switch blocks in every row and column. By adjusting the percentage of long wire and span, a 2.5D or 3D high density MoT-based embedded FPGAs can be designed. For the 3D multi-stacked MoT-based FPGAs, the 2D MoT-based FPGA is sliced into two or more equal sections by adjusting the length of the long wire span. The long wire segments are realized using 3D through silicon via (TSVs) and 2.5D interposer-based multi-FPGAs, we increase the number of cuts and apply appropriate optimization models to scale down the number of long wires and horizontal inter-FPGA interposer wires. 
Using our 2.5/3D CAD models, we demonstrate the speed and area of 3D MoT-based FPGA architecture improved by 54\% and 41\% respectively in comparison to 3D Mesh-based FPGAs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Knodel:2016:MLR, author = "Oliver Knodel and Paul R. Genssler and Rainer G. Spallek", title = "Migration of long-running Tasks between Reconfigurable Resources using Virtualization", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "56--61", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039913", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Computing performance and scalability are the essential basics in modern data centres. Field Programmable Gate Arrays (FPGAs) provide a promising opportunity to improve performance, security and energy efficiency. Especially background acceleration of computationally complex and long-running tasks is an important field of application. A flexible use of reconfigurable devices within a cloud context requires an abstraction of the actual hardware through virtualization. In this paper we present an approach inspired by paravirtualized machines for the integration of reconfigurable hardware into cloud services. Using partial reconfiguration our hardware and software framework virtualizes a single physical FPGA to enable multiple independent user designs. Essential components are the management of those virtual user-defined accelerators (vFPGA) and their migration between physical FPGAs to achieve higher system-wide utilization. The migration requires saving and restoring the internal state or context of the vFPGA. 
We demonstrate the application possibilities and the resource trade-off of our approach by transferring a running design from one physical FPGA to another. Moreover, we present future perspectives for the use of FPGAs in cloud-based environments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Tada:2016:ESG, author = "Jubee Tada and Maiki Hosokawa and Ryusuke Egawa and Hiroaki Kobayashi", title = "Effects of Stacking Granularity on {$3$-D} Stacked Floating-point Fused Multiply Add Units", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "62--67", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039914", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Three-dimensional stacked integrated circuits (3D-SICs) have been expected to overcome the limitations of conventional two-dimensional (2-D) implemented circuits. Since a stacking strategy affects the performance and the power consumption of 3D-SICs, this paper examines two stacking strategies for designing the 3-D stacked floating-point fused multiply-add (FP-FMA) module which contains four FP-FMA units. Experimental results show that a coarse-grain stacking strategy is suitable for reducing critical path delay of the 3-D stacked FP-FMA module. On the other hand, a fine-grain stacking strategy is suitable for reducing power consumption. 
The 3-D stacked FP-FMA module which is designed based on a fine-grain stacking strategy achieves an 8.4\% critical path delay reduction and an 18\% average power reduction compared with the 2-D implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", keywords = "fused multiply-add (FMA) instruction", remark = "HEART '16 conference proceedings.", } @Article{Su:2016:NNB, author = "Jiang Su and Jianxiong Liu and David B. Thomas and Peter Y. K. Cheung", title = "Neural Network Based Reinforcement Learning Acceleration on {FPGA} Platforms", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "68--73", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039915", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Deep Q-learning (DQN) is a recently proposed reinforcement learning algorithm where a neural network is applied as a non-linear approximator to its value function. The exploitation-exploration mechanism allows the training and prediction of the NN to execute simultaneously in an agent during its interaction with the environment. Agents often act independently on battery power, so the training and prediction must occur within the agent and on a limited power budget. In this work, we propose an FPGA acceleration system design for Neural Network Q-learning (NNQL). Our proposed system has high flexibility due to the support to run-time network parameterization, which allows neuroevolution algorithms to dynamically restructure the network to achieve better learning results. Additionally, the power consumption of our proposed system is adaptive to the network size because of a new processing element design. 
Based on our test cases on networks with hidden layer size ranging from 32 to 16384, our proposed system achieves 7x to 346x speedup compared to GPU implementation and 22x to 77x speedup to hand-coded CPU counterpart.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{DHollander:2016:HLS, author = "Erik H. D'Hollander", title = "High-Level Synthesis Optimization for Blocked Floating-Point Matrix Multiplication", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "74--79", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039916", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In the last decade floating-point matrix multiplication on FPGAs has been studied extensively and efficient architectures as well as detailed performance models have been developed. By design these IP cores take a fixed footprint which not necessarily optimizes the use of all available resources. Moreover, the low-level architectures are not easily amenable to a parameterized synthesis. In this paper high-level synthesis is used to fine-tune the configuration parameters in order to achieve the highest performance with maximal resource utilization. An\ exploration strategy is presented to optimize the use of critical resources (DSPs, memory) for any given FPGA. To account for the limited memory size on the FPGA, a block-oriented matrix multiplication is organized such that the block summation is done on the CPU while the block multiplication occurs on the logic fabric simultaneously. 
The communication overhead between the CPU and the FPGA is minimized by streaming the blocks in a Gray code ordering scheme which maximizes the data reuse for consecutive block matrix product calculations. Using high-level synthesis optimization, the programmable logic operates at 93\% of the theoretical peak performance and the combined CPU-FPGA design achieves 76\% of the available hardware processing speed for the floating-point multiplication of 2K by 2K matrices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Li:2016:FBV, author = "Chengzhe Li and Lai Yoong Yee and Hiroshi Maruyama and Yoshiki Yamaguchi", title = "{FPGA}-based Volleyball Player Tracker", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "80--86", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039917", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The significant challenge facing sport science is how to grasp the flow of the game and analyze the situation of a match. The use of information technology will facilitate to achieve the goal. The technical issues from the practical application perspective can be classified into three main points: computation speed, system size and complex data analysis considering the accuracy. In this paper, for accelerating image recognition and object tracking, we propose a one-dimensional data pipeline architecture on a field-programmable gate array (FPGA). It satisfies both of high-speed streaming computation and small-sized circuits by considering spatiotemporal data dependence. Volleyball games have been chosen as a target application. The proposed system will identify the position of six volleyball players within real time. 
The design on an FPGA includes pre-processing, color filtering, digitalization, noise reduction, template matching, and so on. The design was implemented and evaluated on Atlys Spartan-6 FPGA Trainer Board with one XILINX Spartan-6 LX45 FPGA. The computational performance achieves 100 frames per second at SVGA 800 by 600 pixel resolution. And our design has good scalability; the performance can easily be enhanced when the larger FPGA is used. The proposed system is also compact, which is composed of one Atlys board and one Atlys VmodCAM stereo-camera board. The average-accuracy rates of pregame situation and during a match are 87.1\% and 65.7\%, respectively. Since the input is streaming data, we can improve the accuracy by considering the previous and the next frames. They could be improved to 90.4\% and 72.2\%, respectively, when we adopt template matching with a moving average filter.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Zhao:2016:SHC, author = "Qian Zhao and Motoki Amagasaki and Masahiro Iida and Morihiro Kuga and Toshinori Sueyoshi", title = "A Study of Heterogeneous Computing Design Method based on Virtualization Technology", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "86--91", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039918", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "One challenge for the heterogeneous computing with the FPGA is how to bridge the development gap between SW and HW designs. The high level synthesis (HLS) technique allows producing hardware with high level languages like C. Design tools based on the HLS like Xilinx SDSoC and SDAccel are developed to speedup SW/HW co-designs. 
However, the developers still require much circuit design skills to use these tools more efficiently. In this paper, we propose a heterogeneous computing platform based on the virtualization technology, namely hCODE. With the help of the virtualization, the HW and SW design can be totally separated. This brings multiple benefits like accelerating a program without modifying or recompiling it, enable high portability and scalability across different HW and operating system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Lin:2016:FHL, author = "Colin Yu Lin and Zhenghong Jiang and Cheng Fu and Hayden Kwok-Hay So and Haigang Yang", title = "{FPGA} High-level Synthesis versus Overlay: Comparisons on Computation Kernels", journal = j-COMP-ARCH-NEWS, volume = "44", number = "4", pages = "92--97", month = sep, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3039902.3039919", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:57 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To promote FPGA to a wider user community and to increase design productivity, two new design methodologies, namely FPGA high-level synthesis (HLS) and FPGA overlay, are presented to use a high-level design abstraction. To make clear distinguish features of each design methodology, we make a comparison of a state-of-the-art FPGA HLS tool, Vivado HLS, and an FPGA overlay tool, ArchSyn, on two computation intensive kernels, matrix-matrix multiplication and fast Fourier transform. In the comparison, FPGA overlay shows an overwhelming superiority in computation performance, which is 8X to 39X faster than FPGA HLS. However, FPGA HLS exhibits its advantages in dynamic power consumption metric. It achieves up to 17X lower power consumption than FPGA overlay. 
Power- and energy-efficiency are another two essential metrics evaluating trade-offs between performance and power consumption. As demonstrated with evaluation results, FPGA overlay is averagely 3.5X better in power-efficiency for FFT kernel, and achieves up to 2 orders of magnitude better energy-efficiency than FPGA HLS.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "HEART '16 conference proceedings.", } @Article{Zhan:2016:PMB, author = "Xusheng Zhan and Yungang Bao and Christian Bienia and Kai Li", title = "{PARSEC3.0}: a Multicore Benchmark Suite with Network Stacks and {SPLASH-2X}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "5", pages = "1--16", month = dec, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3053277.3053279", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Benchmarks play a very important role in accelerating the development and research of CMP. As one of them, the PARSEC suite continues to be updated and revised over and over again so that it can offer better support for researchers. The former versions of PARSEC have enough workloads to evaluate the property of CMP about CPU, cache and memory, but it lacks of applications based on network stack to assess the performance of CMPs in respect of network. In this work, we introduce PARSEC3.0, a new version of PARSEC suite that implements a user-level network stack and generates three network workloads with this stack to cover network domain. We explore the input sets of splash-2 and expand them to multiple scales, a.k.a, splash-2x. We integrate splash-2 and splash-2x into PARSEC framework so that researchers use these benchmark suite conveniently. 
Finally, we evaluate the u-TCP/IP stack and new network workloads, and analyze the characterizes of splash-2 and splash-2x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:2017:BDA, author = "Yunji Chen", title = "Big Data Analytics and Intelligence at Alibaba Cloud", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "1--1", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037699", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As China's largest cloud service provider, Alibaba Cloud has been one of the fastest growing cloud computing platforms in the world. In this talk, I'll present an overview of Big Data and AI computing platform at Alibaba Cloud, which consists of a wide range of products and services to enable fast and efficient big data development and intelligent analysis. The underlying computing infrastructure supports a variety of computation scenarios, including batch, interactive, stream, and graph computation, as well as large-scale machine learning on heterogeneous cloud-scale data centers. Several big data products, such as rule-based engine, recommendation system, BI tools, etc., are provided to address different business needs. The platform not only supports Alibaba's internal businesses but also provides solid services to enterprise customers.
In addition, I'll describe key techniques and system internals, and outline outstanding research and engineering challenges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Cherupalli:2017:DAS, author = "Hari Cherupalli and Henry Duwe and Weidong Ye and Rakesh Kumar and John Sartori", title = "Determining Application-specific Peak Power and Energy Requirements for Ultra-low Power Processors", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "3--16", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037711", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many emerging applications such as IoT, wearables, implantables, and sensor networks are power- and energy-constrained. These applications rely on ultra-low-power processors that have rapidly become the most abundant type of processor manufactured today. In the ultra-low-power embedded systems used by these applications, peak power and energy requirements are the primary factors that determine critical system characteristics, such as size, weight, cost, and lifetime. While the power and energy requirements of these systems tend to be application-specific, conventional techniques for rating peak power and energy cannot accurately bound the power and energy requirements of an application running on a processor, leading to over-provisioning that increases system size and weight. In this paper, we present an automated technique that performs hardware-software co-analysis of the application and ultra-low-power processor in an embedded system to determine application-specific peak power and energy requirements. 
Our technique provides more accurate, tighter bounds than conventional techniques for determining peak power and energy requirements, reporting 15\% lower peak power and 17\% lower peak energy, on average, than a conventional approach based on profiling and guardbanding. Compared to an aggressive stressmark-based approach, our technique reports power and energy bounds that are 26\% and 26\% lower, respectively, on average. Also, unlike conventional approaches, our technique reports guaranteed bounds on peak power and energy independent of an application's input set. Tighter bounds on peak power and energy can be exploited to reduce system size, weight, and cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Chen:2017:PPQ, author = "Quan Chen and Hailong Yang and Minyi Guo and Ram Srivatsa Kannan and Jason Mars and Lingjia Tang", title = "{Prophet}: Precise {QoS} Prediction on Non-Preemptive Accelerators to Improve Utilization in Warehouse-Scale Computers", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "17--32", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037700", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Guaranteeing Quality-of-Service (QoS) of latency-sensitive applications while improving server utilization through application co-location is important yet challenging in modern datacenters. The key challenge is that when applications are co-located on a server, performance interference due to resource contention can be detrimental to the application QoS. 
Although prior work has proposed techniques to identify ``safe'' co-locations where application QoS is satisfied by predicting the performance interference on multicores, no such prediction technique on accelerators such as GPUs. In this work, we present Prophet, an approach to precisely predict the performance degradation of latency-sensitive applications on accelerators due to application co-location. We analyzed the performance interference on accelerators through a real system investigation and found that unlike on multicores where the key contentious resources are shared caches and main memory bandwidth, the key contentious resources on accelerators are instead processing elements, accelerator memory bandwidth and PCIe bandwidth. Based on this observation, we designed interference models that enable the precise prediction for processing element, accelerator memory bandwidth and PCIe bandwidth contention on real hardware. By using a novel technique to forecast solo-run execution traces of the co-located applications using interference models, Prophet can accurately predict the performance degradation of latency-sensitive applications on non-preemptive accelerators. Using Prophet, we can identify ``safe'' co-locations on accelerators to improve utilization without violating the QoS target. Our evaluation shows that Prophet can predict the performance degradation with an average prediction error 5.47\% on real systems. 
Meanwhile, based on the prediction, Prophet achieves accelerator utilization improvements of 49.9\% on average while maintaining the QoS target of latency-sensitive applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Kanev:2017:MAM, author = "Svilen Kanev and Sam Likun Xi and Gu-Yeon Wei and David Brooks", title = "{Mallacc}: Accelerating Memory Allocation", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "33--45", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037736", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Recent work shows that dynamic memory allocation consumes nearly 7\% of all cycles in Google datacenters. With the trend towards increased specialization of hardware, we propose Mallacc, an in-core hardware accelerator designed for broad use across a number of high-performance, modern memory allocators. The design of Mallacc is quite different from traditional throughput-oriented hardware accelerators. Because memory allocation requests tend to be very frequent, fast, and interspersed inside other application code, accelerators must be optimized for latency rather than throughput and area overheads must be kept to a bare minimum. Mallacc accelerates the three primary operations of a typical memory allocation request: size class computation, retrieval of a free memory block, and sampling of memory usage. 
Our results show that malloc latency can be reduced by up to 50\% with a hardware cost of less than 1500 $\mu$m$^2$ of silicon area, less than 0.006\% of a typical high-performance processor core.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Wen:2017:REV, author = "Shasha Wen and Milind Chabbi and Xu Liu", title = "{REDSPY}: Exploring Value Locality in Software", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "47--61", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037729", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Complex code bases with several layers of abstractions have abundant inefficiencies that affect the execution time. Value redundancy is a kind of inefficiency where the same values are repeatedly computed, stored, or retrieved over the course of execution. Not all redundancies can be easily detected or eliminated with compiler optimization passes due to the inherent limitations of the static analysis. Microscopic observation of whole executions at instruction- and operand-level granularity breaks down abstractions and helps recognize redundancies that masquerade in complex programs. We have developed REDSPY---a fine-grained profiler to pinpoint and quantify redundant operations in program executions. Value redundancy may happen over time at same locations or in adjacent locations, and thus it has temporal and spatial locality. REDSPY identifies both temporal and spatial value locality. Furthermore, REDSPY is capable of identifying values that are approximately the same, enabling optimization opportunities in HPC codes that often use floating point computations.
REDSPY provides intuitive optimization guidance by apportioning redundancies to their provenance---source lines and execution calling contexts. REDSPY pinpointed dramatically high volume of redundancies in programs that were optimization targets for decades, such as SPEC CPU2006 suite, Rodinia benchmark, and NWChem---a production computational chemistry code. Guided by REDSPY, we were able to eliminate redundancies that resulted in significant speedups.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Bhattacharjee:2017:TTP, author = "Abhishek Bhattacharjee", title = "Translation-Triggered Prefetching", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "63--76", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037705", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We propose translation-enabled memory prefetching optimizations or TEMPO, a low-overhead hardware mechanism to boost memory performance by exploiting the operating system's (OS) virtual memory subsystem. We are the first to make the following observations: (1) a substantial fraction (20--40\%) of DRAM references in modern big-data workloads are devoted to accessing page tables; and (2) when memory references require page table lookups in DRAM, the vast majority of them (98\%+) also look up DRAM for the subsequent data access. TEMPO exploits these observations to enable DRAM row-buffer and on-chip cache prefetching of the data that page tables point to.
TEMPO requires trivial changes to the memory controller (under 3\% additional area), no OS or application changes, and improves performance by 10--30\% and energy by 1--14\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Kim:2017:TAA, author = "Channoh Kim and Jaehyeok Kim and Sungmin Kim and Dooyoung Kim and Namho Kim and Gitae Na and Young H. Oh and Hyeon Gyu Cho and Jae W. Lee", title = "Typed Architectures: Architectural Support for Lightweight Scripting", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "77--90", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037726", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Dynamic scripting languages are becoming more and more widely adopted not only for fast prototyping but also for developing production-grade applications. They provide high-productivity programming environments featuring high levels of abstraction with powerful built-in functions, automatic memory management, object-oriented programming paradigm and dynamic typing. However, their flexible, dynamic type systems easily become the source of inefficiency in terms of instruction count, memory footprint, and energy consumption. This overhead makes it challenging to deploy these high-productivity programming technologies on emerging single-board computers for IoT applications. Addressing this challenge, this paper introduces Typed Architectures, a high-efficiency, low-cost execution substrate for dynamic scripting languages, where each data variable retains high-level type information at an ISA level.
Typed Architectures calculate and check the dynamic type of each variable implicitly in hardware, rather than explicitly in software, hence significantly reducing instruction count for dynamic type checking. Besides, Typed Architectures introduce polymorphic instructions (e.g., xadd), which are bound to the correct native instruction at runtime within the pipeline (e.g., add or fadd) to efficiently implement polymorphic operators. Finally, Typed Architectures provide hardware support for flexible yet efficient type tag extraction and insertion, capturing common data layout patterns of tag-value pairs. Our evaluation using a fully synthesizable RISC-V RTL design on FPGA shows that Typed Architectures achieve geomean speedups of 11.2\% and 9.9\% with maximum speedups of 32.6\% and 43.5\% for two production-grade scripting engines for JavaScript and Lua, respectively. Moreover, Typed Architectures improve the energy-delay product (EDP) by 19.3\% for JavaScript and 16.5\% for Lua with an area overhead of 1.6\% at a 40nm technology node.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Seo:2017:FAS, author = "Jihye Seo and Wook-Hee Kim and Woongki Baek and Beomseok Nam and Sam H. Noh", title = "Failure-Atomic Slotted Paging for Persistent Memory", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "91--104", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037737", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The slotted-page structure is a database page format commonly used for managing variable-length records. 
In this work, we develop a novel ``failure-atomic slotted page structure'' for persistent memory that leverages byte addressability and durability of persistent memory to minimize redundant write operations used to maintain consistency in traditional database systems. Failure-atomic slotted paging consists of two key elements: (i) in-place commit per page using hardware transactional memory and (ii) slot header logging that logs the commit mark of each page. The proposed scheme is implemented in SQLite and compared against NVWAL, the current state-of-the-art scheme. Our performance study shows that our failure-atomic slotted paging shows optimal performance for database transactions that insert a single record. For transactions that touch more than one database page, our proposed slot-header logging scheme minimizes the logging overhead by avoiding duplicating pages and logging only the metadata of the dirty pages. Overall, we find that our failure-atomic slotted-page management scheme reduces database logging overhead to 1/6 and improves query response time by up to 33\% compared to NVWAL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Nguyen:2017:WSP, author = "Donald Nguyen and Keshav Pingali", title = "What Scalable Programs Need from Transactional Memory", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "105--118", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037750", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Transactional memory (TM) has been the focus of numerous studies, and it is supported in processors such as the IBM Blue Gene/Q and Intel Haswell. 
Many studies have used the STAMP benchmark suite to evaluate their designs. However, the speedups obtained for the STAMP benchmarks on all TM systems we know of are quite limited; for example, with 64 threads on the IBM Blue Gene/Q, we observe a median speedup of 1.4X using the Blue Gene/Q hardware transactional memory (HTM), and a median speedup of 4.1X using a software transactional memory (STM). What limits the performance of these benchmarks on TMs? In this paper, we argue that the problem lies with the programming model and data structures used to write them. To make this point, we articulate two principles that we believe must be embodied in any scalable program and argue that STAMP programs violate both of them. By modifying the STAMP programs to satisfy both principles, we produce a new set of programs that we call the Stampede suite. Its median speedup on the Blue Gene/Q is 8.0X when using an STM. The two principles also permit us to simplify the TM design. Using this new STM with the Stampede benchmarks, we obtain a median speedup of 17.7X with 64 threads on the Blue Gene/Q and 13.2X with 32 threads on an Intel Westmere system. These results suggest that HTM and STM designs will benefit if more attention is paid to the division of labor between application programs, systems software, and hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Trippel:2017:TMM, author = "Caroline Trippel and Yatin A. 
Manerkar and Daniel Lustig and Michael Pellauer and Margaret Martonosi", title = "{TriCheck}: Memory Model Verification at the Trisection of Software, Hardware, and {ISA}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "119--133", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037719", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory consistency models (MCMs) which govern inter-module interactions in a shared memory system, are a significant, yet often under-appreciated, aspect of system design. MCMs are defined at the various layers of the hardware-software stack, requiring thoroughly verified specifications, compilers, and implementations at the interfaces between layers. Current verification techniques evaluate segments of the system stack in isolation, such as proving compiler mappings from a high-level language (HLL) to an ISA or proving validity of a microarchitectural implementation of an ISA. This paper makes a case for full-stack MCM verification and provides a toolflow, TriCheck, capable of verifying that the HLL, compiler, ISA, and implementation collectively uphold MCM requirements. The work showcases TriCheck's ability to evaluate a proposed ISA MCM in order to ensure that each layer and each mapping is correct and complete. Specifically, we apply TriCheck to the open source RISC-V ISA [55], seeking to verify accurate, efficient, and legal compilations from C11. We uncover under-specifications and potential inefficiencies in the current RISC-V ISA documentation and identify possible solutions for each. As an example, we find that a RISC-V-compliant microarchitecture allows 144 outcomes forbidden by C11 to be observed out of 1,701 litmus tests examined. 
Overall, this paper demonstrates the necessity of full-stack verification for detecting MCM-related bugs in the hardware-software stack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Nalli:2017:APM, author = "Sanketh Nalli and Swapnil Haria and Mark D. Hill and Michael M. Swift and Haris Volos and Kimberly Keeton", title = "An Analysis of Persistent Memory Use with {WHISPER}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "135--148", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037730", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging non-volatile memory (NVM) technologies promise durability with read and write latencies comparable to volatile memory (DRAM). We define Persistent Memory (PM) as NVM accessed with byte addressability at low latency via normal memory instructions. Persistent-memory applications ensure the consistency of persistent data by inserting ordering points between writes to PM allowing the construction of higher-level transaction mechanisms. An epoch is a set of writes to PM between ordering points. To put systems research in PM on a firmer footing, we developed and analyzed a PM benchmark suite called WHISPER (Wisconsin-HP Labs Suite for Persistence) that comprises ten PM applications we gathered to cover all current interfaces to PM. 
A quantitative analysis reveals several insights: (a) only 4\% of writes in PM-aware applications are to PM and the rest are to volatile memory, (b) software transactions are often implemented with 5 to 50 ordering points, (c) 75\% of epochs update exactly one 64B cache line, (d) 80\% of epochs from the same thread depend on previous epochs from the same thread, while few epochs depend on epochs from other threads. Based on our analysis, we propose the Hands-off Persistence System (HOPS) to track updates to PM in hardware. Current hardware design requires applications to force data to PM as each epoch ends. HOPS provides high-level ISA primitives for applications to express durability and ordering constraints separately and enforces them automatically, while achieving 24.3\% better performance over current approaches to persistence.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Zhang:2017:PPD, author = "Tong Zhang and Changhee Jung and Dongyoon Lee", title = "{ProRace}: Practical Data Race Detection for Production Use", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "149--162", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037708", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper presents ProRace, a dynamic data race detector practical for production runs. It is lightweight, but still offers high race detection capability. To track memory accesses, ProRace leverages instruction sampling using the performance monitoring unit (PMU) in commodity processors. Our PMU driver enables ProRace to sample more memory accesses at a lower cost compared to the state-of-the-art Linux driver.
Moreover, ProRace uses PMU-provided execution contexts including register states and program path, and reconstructs unsampled memory accesses offline. This technique allows ProRace to overcome inherent limitations of sampling and improve the detection coverage by performing data race detection on the trace with not only sampled but also reconstructed memory accesses. Experiments using racy production software including apache and mysql shows that, with a reasonable offline cost, ProRace incurs only 2.6\% overhead at runtime with 27.5\% detection probability with a sampling period of 10,000.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Olson:2017:CGM, author = "Lena E. Olson and Mark D. Hill and David A. Wood", title = "Crossing Guard: Mediating Host-Accelerator Coherence Interactions", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "163--176", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037715", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Specialized hardware accelerators have performance and energy-efficiency advantages over general-purpose processors. To fully realize these benefits and aid programmability, accelerators may share a physical and virtual address space and full cache coherence with the host system. However, allowing accelerators --- particularly those designed by third parties --- to directly communicate with host coherence protocols poses several problems. Host coherence protocols are complex, vary between companies, and may be proprietary, increasing burden on accelerator designers. Bugs in the accelerator implementation may cause crashes and other serious consequences to the host system.
We propose Crossing Guard, a coherence interface between the host coherence system and accelerators. The Crossing Guard interface provides the accelerator designer with a standardized set of coherence messages that are simple enough to aid in design of bug-free coherent caches. At the same time, they are sufficiently complex to allow customized and optimized accelerator caches with performance comparable to using the host protocol. The Crossing Guard hardware is implemented as part of the trusted host, and provides complete safety to the host coherence system, even in the presence of a pathologically buggy accelerator cache.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{McMahan:2017:ASF, author = "Joseph McMahan and Michael Christensen and Lawton Nichols and Jared Roesch and Sung-Yee Guo and Ben Hardekopf and Timothy Sherwood", title = "An Architecture Supporting Formal and Compositional Binary Analysis", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "177--191", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037733", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Building a trustworthy life-critical embedded system requires deep reasoning about the potential effects that sequences of machine instructions can have on full system operation. Rather than trying to analyze complete binaries and the countless ways their instructions can interact with one another --- memory, side effects, control registers, implicit state, etc. --- we explore a new approach. 
We propose an architecture controlled by a thin computational layer designed to tightly correspond with the lambda calculus, drawing on principles of functional programming to bring the assembly much closer to myriad reasoning frameworks, such as the Coq proof assistant. This approach allows assembly-level verified versions of critical code to operate safely in tandem with arbitrary code, including imperative and unverified system components, without the need for large supporting trusted computing bases. We demonstrate that this computational layer can be built in such a way as to simultaneously provide full programmability and compact, precise, and complete semantics, while still using hardware resources comparable to normal embedded systems. To demonstrate the practicality of this approach, our FPGA-implemented prototype runs an embedded medical application which monitors and treats life-threatening arrhythmias. Though the system integrates untrusted and imperative components, our architecture allows for the formal verification of multiple properties of the end-to-end system, including a proof of correctness of the assembly-level implementation of the core algorithm, the integrity of trusted data via a non-interference proof, and a guarantee that our prototype meets critical timing requirements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Hsiao:2017:ASI, author = "Chun-Hung Hsiao and Satish Narayanasamy and Essam Muhammad Idris Khan and Cristiano L. Pereira and Gilles A. 
Pokam", title = "{AsyncClock}: Scalable Inference of Asynchronous Event Causality", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "193--205", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037712", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Asynchronous programming model is commonly used in mobile systems and Web 2.0 environments. Asynchronous race detectors use algorithms that are an order of magnitude performance and space inefficient compared to conventional data race detectors. We solve this problem by identifying and addressing two important problems in reasoning about causality between asynchronous events. Unlike conventional signal-wait operations, establishing causal order between two asynchronous events is fundamentally more challenging as there is no common handle they operate on. We propose a new primitive named AsyncClock that addresses this problem by explicitly tracking causally preceding events, and show that AsyncClock can handle a wide variety of asynchronous causality models. We also address the important scalability problem of efficiently identifying heirless events whose metadata can be reclaimed. We built the first single-pass, non-graph-based Android race detector using our algorithm and applied it to find errors in 20 popular applications. Our tool incurs about 6x performance overhead, which is several times more efficient than the state-of-the-art solution. It also scales well with the execution length. 
We used our tool to find 147 previously unknown harmful races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Calciu:2017:BBC, author = "Irina Calciu and Siddhartha Sen and Mahesh Balakrishnan and Marcos K. Aguilera", title = "Black-box Concurrent Data Structures for {NUMA} Architectures", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "207--221", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037721", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "High-performance servers are Non-Uniform Memory Access (NUMA) machines. To fully leverage these machines, programmers need efficient concurrent data structures that are aware of the NUMA performance artifacts. We propose Node Replication (NR), a black-box approach to obtaining such data structures. NR takes an arbitrary sequential data structure and automatically transforms it into a NUMA-aware concurrent data structure satisfying linearizability. Using NR requires no expertise in concurrent data structure design, and the result is free of concurrency bugs. NR draws ideas from two disciplines: shared-memory algorithms and distributed systems. Briefly, NR implements a NUMA-aware shared log, and then uses the log to replicate data structures consistently across NUMA nodes. NR is best suited for contended data structures, where it can outperform lock-free algorithms by 3.1x, and lock-based solutions by 30x. To show the benefits of NR to a real application, we apply NR to the data structures of Redis, an in-memory storage system. The result outperforms other methods by up to 14x. 
The cost of NR is additional memory for its log and replicas.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Vora:2017:CCR, author = "Keval Vora and Chen Tian and Rajiv Gupta and Ziang Hu", title = "{CoRAL}: Confined Recovery in Distributed Asynchronous Graph Processing", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "223--236", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037747", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Existing distributed asynchronous graph processing systems employ checkpointing to capture globally consistent snapshots and rollback all machines to most recent checkpoint to recover from machine failures. In this paper we argue that recovery in distributed asynchronous graph processing does not require the entire execution state to be rolled back to a globally consistent state due to the relaxed asynchronous execution semantics. We define the properties required in the recovered state for it to be usable for correct asynchronous processing and develop CoRAL, a lightweight checkpointing and recovery algorithm. First, this algorithm carries out confined recovery that only rolls back graph execution states of the failed machines to affect recovery. Second, it relies upon lightweight checkpoints that capture locally consistent snapshots with a reduced peak network bandwidth requirement. Our experiments using real-world graphs show that our technique recovers from failures and finishes processing 1.5x to 3.2x faster compared to the traditional asynchronous checkpointing and recovery mechanism when failures impact 1 to 6 machines of a 16 machine cluster. 
Moreover, capturing locally consistent snapshots significantly reduces intermittent high peak bandwidth usage required to save the snapshots --- the average reduction in 99th percentile bandwidth ranges from 22\% to 51\% while 1 to 6 snapshot replicas are being maintained.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Vora:2017:KFA, author = "Keval Vora and Rajiv Gupta and Guoqing Xu", title = "{KickStarter}: Fast and Accurate Computations on Streaming Graphs via Trimmed Approximations", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "237--251", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037748", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Continuous processing of a streaming graph maintains an approximate result of the iterative computation on a recent version of the graph. Upon a user query, the accurate result on the current graph can be quickly computed by feeding the approximate results to the iterative computation --- a form of incremental computation that corrects the (small amount of) error in the approximate result. Despite the effectiveness of this approach in processing growing graphs, it is generally not applicable when edge deletions are present --- existing approximations can lead to either incorrect results (e.g., monotonic computations terminate at an incorrect minima/maxima) or poor performance (e.g., with approximations, convergence takes longer than performing the computation from scratch). This paper presents KickStarter, a runtime technique that can trim the approximate values for a subset of vertices impacted by the deleted edges. 
The trimmed approximation is both safe and profitable, enabling the computation to produce correct results and converge quickly. KickStarter works for a class of monotonic graph algorithms and can be readily incorporated in any existing streaming graph system. Our experiments with four streaming algorithms on five large graphs demonstrate that trimming not only produces correct results but also accelerates these algorithms by 8.5--23.7x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Powers:2017:BBG, author = "Bobby Powers and John Vilk and Emery D. Berger", title = "{Browsix}: Bridging the Gap Between {Unix} and the Browser", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "253--266", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037727", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", abstract = "Applications written to run on conventional operating systems typically depend on OS abstractions like processes, pipes, signals, sockets, and a shared file system. Porting these applications to the web currently requires extensive rewriting or hosting significant portions of code server-side because browsers present a nontraditional runtime environment that lacks OS functionality. This paper presents Browsix, a framework that bridges the considerable gap between conventional operating systems and the browser, enabling unmodified programs expecting a Unix-like environment to run directly in the browser. 
Browsix comprises two core parts: (1) a JavaScript-only system that makes core Unix features (including pipes, concurrent processes, signals, sockets, and a shared file system) available to web applications; and (2) extended JavaScript runtimes for C, C++, Go, and Node.js that support running programs written in these languages as processes in the browser. Browsix supports running a POSIX shell, making it straightforward to connect applications together via pipes. We illustrate Browsix's capabilities via case studies that demonstrate how it eases porting legacy applications to the browser and enables new functionality. We demonstrate a Browsix-enabled LaTeX editor that operates by executing unmodified versions of pdfLaTeX and BibTeX. This browser-only LaTeX editor can render documents in seconds, making it fast enough to be practical. We further demonstrate how Browsix lets us port a client-server application to run entirely in the browser for disconnected operation. Creating these applications required less than 50 lines of glue code and no code modifications, demonstrating how easily Browsix can be used to build sophisticated web applications from existing parts without modification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Rajbhandari:2017:OCM, author = "Samyam Rajbhandari and Yuxiong He and Olatunji Ruwase and Michael Carbin and Trishul Chilimbi", title = "Optimizing {CNNs} on Multicores for Scalability, Performance and Goodput", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "267--280", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037745", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Convolutional Neural Networks 
(CNN) are a class of Artificial Neural Networks (ANN) that are highly efficient at the pattern recognition tasks that underlie difficult AI problems in a variety of domains, such as speech recognition, object recognition, and natural language processing. CNNs are, however, computationally intensive to train. This paper presents the first characterization of the performance optimization opportunities for training CNNs on CPUs. Our characterization includes insights based on the structure of the network itself (i.e., intrinsic arithmetic intensity of the convolution and its scalability under parallelism) as well as dynamic properties of its execution (i.e., sparsity of the computation). Given this characterization, we present an automatic framework called spg-CNN for optimizing CNN training on CPUs. It comprises of a computation scheduler for efficient parallel execution, and two code generators: one that optimizes for sparsity, and the other that optimizes for spatial reuse in convolutions. 
We evaluate spg-CNN using convolutions from a variety of real world benchmarks, and show that spg-CNN can train CNNs faster than state-of-the-art approaches by an order of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Sundararajah:2017:LTN, author = "Kirshanthan Sundararajah and Laith Sakka and Milind Kulkarni", title = "Locality Transformations for Nested Recursive Iteration Spaces", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "281--295", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037720", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "There has been a significant amount of effort invested in designing scheduling transformations such as loop tiling and loop fusion that rearrange the execution of dynamic instances of loop nests to place operations that access the same data close together temporally. In recent years, there has been interest in designing similar transformations that operate on recursive programs, but until now these transformations have only considered simple scenarios: multiple recursions to be fused, or a recursion nested inside a simple loop. This paper develops the first set of scheduling transformations for nested recursions: recursive methods that call other recursive methods. These are the recursive analog to nested loops. 
We present a transformation called recursion twisting that automatically improves locality at all levels of the memory hierarchy, and show that this transformation can yield substantial performance improvements across several benchmarks that exhibit nested recursion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Li:2017:LAC, author = "Ang Li and Shuaiwen Leon Song and Weifeng Liu and Xu Liu and Akash Kumar and Henk Corporaal", title = "Locality-Aware {CTA} Clustering for Modern {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "297--311", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037709", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cache is designed to exploit locality; however, the role of on-chip L1 data caches on modern GPUs is often awkward. The locality among global memory requests from different SMs (Streaming Multiprocessors) is predominantly harvested by the commonly-shared L2 with long access latency; while the in-core locality, which is crucial for performance delivery, is handled explicitly by user-controlled scratchpad memory. In this work, we disclose another type of data locality that has been long ignored but with performance boosting potential --- the inter-CTA locality. Exploiting such locality is rather challenging due to unclear hardware feasibility, unknown and inaccessible underlying CTA scheduler, and small in-core cache capacity. To address these issues, we first conduct a thorough empirical exploration on various modern GPUs and demonstrate that inter-CTA locality can be harvested, both spatially and temporally, on L1 or L1/Tex unified cache. 
Through further quantification process, we prove the significance and commonality of such locality among GPU applications, and discuss whether such reuse is exploitable. By leveraging these insights, we propose the concept of CTA-Clustering and its associated software-based techniques to reshape the default CTA scheduling in order to group the CTAs with potential reuse together on the same SM. Our techniques require no hardware modification and can be directly deployed on existing GPUs. In addition, we incorporate these techniques into an integrated framework for automatic inter-CTA locality optimization. We evaluate our techniques using a wide range of popular GPU applications on all modern generations of NVIDIA GPU architectures. The results show that our proposed techniques significantly improve cache performance through reducing L2 cache transactions by 55\%, 65\%, 29\%, 28\% on average for Fermi, Kepler, Maxwell and Pascal, respectively, leading to an average of 1.46x, 1.48x, 1.45x, 1.41x (up to 3.8x, 3.6x, 3.1x, 3.3x) performance speedups for applications with algorithm-related inter-CTA reuse.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Churchill:2017:SLS, author = "Berkeley Churchill and Rahul Sharma and JF Bastien and Alex Aiken", title = "Sound Loop Superoptimization for {Google Native Client}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "313--326", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037754", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software fault isolation (SFI) is an important technique for the construction of secure operating systems, web browsers, and other extensible software. 
We demonstrate that superoptimization can dramatically improve the performance of Google Native Client, a SFI system that ships inside the Google Chrome Browser. Key to our results are new techniques for superoptimization of loops: we propose a new architecture for superoptimization tools that incorporates both a fully sound verification technique to ensure correctness and a bounded verification technique to guide the search to optimized code. In our evaluation we optimize 13 libc string functions, formally verify the correctness of the optimizations and report a median and average speedup of 25\% over the libraries shipped by Google.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Bianchini:2017:IDE, author = "Ricardo Bianchini", title = "Improving Datacenter Efficiency", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "327--327", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3046426", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Internet companies can improve datacenter efficiency and reduce costs, by minimizing resource waste while avoiding (or limiting) performance degradation. In this talk, I will first overview a few of the efficiency-related efforts we are undertaking at Microsoft, including leveraging workload history to improve resource management. 
I will then discuss some lessons from deploying these efforts in production and how they relate to academic research.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Liu:2017:DBD, author = "Mengxing Liu and Mingxing Zhang and Kang Chen and Xuehai Qian and Yongwei Wu and Weimin Zheng and Jinglei Ren", title = "{DudeTM}: Building Durable Transactions with Decoupling for Persistent Memory", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "329--343", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037714", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Emerging non-volatile memory (NVM) offers non-volatility, byte-addressability and fast access at the same time. To make the best use of these properties, it has been shown by empirical evidence that programs should access NVM directly through CPU load and store instructions, so that the overhead of a traditional file system or database can be avoided. Thus, durable transactions become a common choice of applications for accessing persistent memory data in a crash consistent manner. However, existing durable transaction systems employ either undo logging, which requires a fence for every memory write, or redo logging, which requires intercepting all memory reads within transactions. This paper presents DUDETM, a crash-consistent durable transaction system that avoids the drawbacks of both undo logging and redo logging. DUDETM uses shadow DRAM to decouple the execution of a durable transaction into three fully asynchronous steps. The advantage is that only minimal fences and no memory read instrumentation are required. 
This design also enables an out-of-the-box transactional memory (TM) to be used as an independent component in our system. The evaluation results show that DUDETM adds durability to a TM system with only 7.4 $ \sim $ 24.6\% throughput degradation. Compared to the existing durable transaction systems, DUDETM provides $ 1.7 \times $ to $ 4.4 \times $ higher throughput. Moreover, DUDETM can be implemented with existing hardware TMs with minor hardware modifications, leading to a further $ 1.7 \times $ speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Klimovic:2017:RRF, author = "Ana Klimovic and Heiner Litz and Christos Kozyrakis", title = "{ReFlex}: Remote Flash $ \approx $ Local Flash", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "345--359", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037732", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Remote access to NVMe Flash enables flexible scaling and high utilization of Flash capacity and IOPS within a datacenter. However, existing systems for remote Flash access either introduce significant performance overheads or fail to isolate the multiple remote clients sharing each Flash device. We present ReFlex, a software-based system for remote Flash access, that provides nearly identical performance to accessing local Flash. ReFlex uses a dataplane kernel to closely integrate networking and storage processing to achieve low latency and high throughput at low resource requirements. Specifically, ReFlex can serve up to 850K IOPS per core over TCP/IP networking, while adding 21 $ \mu $s over direct access to local Flash.
ReFlex uses a QoS scheduler that can enforce tail latency and throughput service-level objectives (SLOs) for thousands of remote clients. We show that ReFlex allows applications to use remote Flash while maintaining their original performance with local Flash.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Jevdjic:2017:ASC, author = "Djordje Jevdjic and Karin Strauss and Luis Ceze and Henrique S. Malvar", title = "Approximate Storage of Compressed and Encrypted Videos", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "361--373", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037718", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The popularization of video capture devices has created strong storage demand for encoded videos. Approximate storage can ease this demand by enabling denser storage at the expense of occasional errors. Unfortunately, even minor storage errors, such as bit flips, can result in major visual damage in encoded videos. Similarly, video encryption, widely employed for privacy and digital rights management, may create long dependencies between bits that show little or no tolerance to storage errors. In this paper we propose VideoApp, a novel and efficient methodology to compute bit-level reliability requirements for encoded videos by tracking visual and metadata dependencies within encoded bitstreams. We further show how VideoApp can be used to trade video quality for storage density in an optimal way. 
We integrate our methodology into a popular H.264 encoder to partition an encoded video stream into multiple streams that can receive different levels of error correction according to their reliability needs. When applied to a dense and highly error-prone multi-level cell storage substrate, our variable error correction mechanism reduces the error correction overhead by half under the most error-intolerant encoder settings, achieving quality/density points that neither compression nor approximation can achieve alone. Finally, we define the basic invariants needed to support encrypted approximate video storage. We present an analysis of block cipher modes of operation, showing that some are fully compatible with approximation, enabling approximate and secure video storage systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Elyasi:2017:EIR, author = "Nima Elyasi and Mohammad Arjomand and Anand Sivasubramaniam and Mahmut T. Kandemir and Chita R. Das and Myoungsoo Jung", title = "Exploiting Intra-Request Slack to Improve {SSD} Performance", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "375--388", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037728", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With Solid State Disks (SSDs) offering high degrees of parallelism, SSD controllers place data and direct requests to exploit the maximum offered hardware parallelism. In the quest to maximize parallelism and utilization, sub-requests of a request that are directed to different flash chips by the scheduler can experience differential wait times since their individual queues are not coordinated and load balanced at all times. 
Since the macro request is considered complete only when its last sub-request completes, some of its sub-requests that complete earlier have to necessarily wait for this last sub-request. This paper opens the door to a new class of schedulers to leverage such slack between sub-requests in order to improve response times. Specifically, the paper presents the design and implementation of a slack-enabled re-ordering scheduler, called Slacker, for sub-requests issued to each flash chip. Layered under a modern SSD request scheduler, Slacker estimates the slack of each incoming sub-request to a flash chip and allows them to jump ahead of existing sub-requests with sufficient slack so as to not detrimentally impact their response times. Slacker is simple to implement and imposes only marginal additions to the hardware. Using a spectrum of 21 workloads with diverse read-write characteristics, we show that Slacker provides as much as 19.5\%, 13\% and 14.5\% improvement in response times, with average improvements of 12\%, 6.5\% and 8.5\%, for write-intensive, read-intensive and read-write balanced workloads, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Wang:2017:GSM, author = "Kai Wang and Aftab Hussain and Zhiqiang Zuo and Guoqing Xu and Ardalan Amiri Sani", title = "{Graspan}: a Single-machine Disk-based Graph System for Interprocedural Static Analyses of Large-scale Systems Code", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "389--404", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037744", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "There is more than a decade-long history of using static analysis to find bugs in 
systems such as Linux. Most of the existing static analyses developed for these systems are simple checkers that find bugs based on pattern matching. Despite the presence of many sophisticated interprocedural analyses, few of them have been employed to improve checkers for systems code due to their complex implementations and poor scalability. In this paper, we revisit the scalability problem of interprocedural static analysis from a ``Big Data'' perspective. That is, we turn sophisticated code analysis into Big Data analytics and leverage novel data processing techniques to solve this traditional programming language problem. We develop Graspan, a disk-based parallel graph system that uses an edge-pair centric computation model to compute dynamic transitive closures on very large program graphs. We implement context-sensitive pointer/alias and dataflow analyses on Graspan. An evaluation of these analyses on large codebases such as Linux shows that their Graspan implementations scale to millions of lines of code and are much simpler than their original implementations. 
Moreover, we show that these analyses can be used to augment the existing checkers; these augmented checkers uncovered 132 new NULL pointer bugs and 1308 unnecessary NULL tests in Linux 4.4.0-rc5, PostgreSQL 8.3.9, and Apache httpd 2.2.18.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Ren:2017:SDH, author = "Ao Ren and Zhe Li and Caiwen Ding and Qinru Qiu and Yanzhi Wang and Ji Li and Xuehai Qian and Bo Yuan", title = "{SC-DCNN}: Highly-Scalable Deep Convolutional Neural Network using Stochastic Computing", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "405--418", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037746", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the recent advance of wearable devices and Internet of Things (IoTs), it becomes attractive to implement the Deep Convolutional Neural Networks (DCNNs) in embedded and portable systems. Currently, executing the software-based DCNNs requires high-performance servers, restricting the widespread deployment on embedded and mobile IoT devices. To overcome this obstacle, considerable research efforts have been made to develop highly-parallel and specialized DCNN accelerators using GPGPUs, FPGAs or ASICs. Stochastic Computing (SC), which uses a bit-stream to represent a number within [-1, 1] by counting the number of ones in the bit-stream, has high potential for implementing DCNNs with high scalability and ultra-low hardware footprint. 
Since multiplications and additions can be calculated using AND gates and multiplexers in SC, significant reductions in power (energy) and hardware footprint can be achieved compared to the conventional binary arithmetic implementations. The tremendous savings in power (energy) and hardware resources allow immense design space for enhancing scalability and robustness for hardware DCNNs. This paper presents SC-DCNN, the first comprehensive design and optimization framework of SC-based DCNNs, using a bottom-up approach. We first present the designs of function blocks that perform the basic operations in DCNN, including inner product, pooling, and activation function. Then we propose four designs of feature extraction blocks, which are in charge of extracting features from input feature maps, by connecting different basic function blocks with joint optimization. Moreover, the efficient weight storage methods are proposed to reduce the area and power (energy) consumption. Putting all together, with feature extraction blocks carefully selected, SC-DCNN is holistically optimized to minimize area and power (energy) consumption while maintaining high network accuracy. 
Experimental results demonstrate that the LeNet5 implemented in SC-DCNN consumes only 17 mm$^2$ area and 1.53 W power, achieves throughput of 781250 images/s, area efficiency of 45946 images/s/mm$^2$, and energy efficiency of 510734 images/J.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Ajay:2017:GIL, author = "Jerry Ajay and Chen Song and Aditya Singh Rathore and Chi Zhou and Wenyao Xu", title = "{$3$DGates}: an Instruction-Level Energy Analysis and Optimization of {$3$D} Printers", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "419--433", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037752", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As the next-generation manufacturing driven force, 3D printing technology is having a transformative effect on various industrial domains and has been widely applied in a broad spectrum of applications. It also progresses towards other versatile fields with portable battery-powered 3D printers working on a limited energy budget. While reducing manufacturing energy is an essential challenge in industrial sustainability and national economics, this growing trend motivates us to explore the energy consumption of the 3D printer for the purpose of energy efficiency. To this end, we perform an in-depth analysis of energy consumption in commercial, off-the-shelf 3D printers from an instruction-level perspective. We build an instruction-level energy model and an energy profiler to analyze the energy cost during the fabrication process.
From the insights obtained by the energy profiler, we propose and implement a cross-layer energy optimization solution, called 3DGates, which spans the instruction-set, the compiler and the firmware. We evaluate 3DGates over 338 benchmarks on a 3D printer and achieve an overall energy reduction of 25\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Cox:2017:EAT, author = "Guilherme Cox and Abhishek Bhattacharjee", title = "Efficient Address Translation for Architectures with Multiple Page Sizes", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "435--448", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037704", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Processors and operating systems (OSes) support multiple memory page sizes. Superpages increase Translation Lookaside Buffer (TLB) hits, while small pages provide fine-grained memory protection. Ideally, TLBs should perform well for any distribution of page sizes. In reality, set-associative TLBs --- used frequently for their energy efficiency compared to fully-associative TLBs --- cannot (easily) support multiple page sizes concurrently. Instead, commercial systems typically implement separate set-associative TLBs for different page sizes. This means that when superpages are allocated aggressively, TLB misses may, counterintuitively, increase even if entries for small pages remain unused (and vice-versa). We invent MIX TLBs, energy-frugal set-associative structures that concurrently support all page sizes by exploiting superpage allocation patterns. MIX TLBs boost the performance (often by 10--30\%) of big-memory applications on native CPUs, virtualized CPUs, and GPUs. 
MIX TLBs are simple and require no OS or program changes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Lesokhin:2017:PFS, author = "Ilya Lesokhin and Haggai Eran and Shachar Raindel and Guy Shapiro and Sagi Grimberg and Liran Liss and Muli Ben-Yehuda and Nadav Amit and Dan Tsafrir", title = "Page Fault Support for Network Controllers", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "449--466", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037710", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Direct network I/O allows network controllers (NICs) to expose multiple instances of themselves, to be used by untrusted software without a trusted intermediary. Direct I/O thus frees researchers from legacy software, fueling studies that innovate in multitenant setups. Such studies, however, overwhelmingly ignore one serious problem: direct memory accesses (DMAs) of NICs disallow page faults, forcing systems to either pin entire address spaces to physical memory and thereby hinder memory utilization, or resort to APIs that pin/unpin memory buffers before/after they are DMAed, which complicates the programming model and hampers performance. We solve this problem by designing and implementing page fault support for InfiniBand and Ethernet NICs. A main challenge we tackle---unique to NICs---is handling receive DMAs that trigger page faults, leaving the NIC without memory to store the incoming data. 
We demonstrate that our solution provides all the benefits associated with ``regular'' virtual memory, notably (1) a simpler programming model that rids users from the need to pin, and (2) the ability to employ all the canonical memory optimizations, such as memory overcommitment and demand-paging based on actual use. We show that, as a result, benchmark performance improves by up to 1.9x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Hu:2017:TFC, author = "Yang Hu and Mingcong Song and Tao Li", title = "Towards {``Full Containerization''} in Containerized Network Function Virtualization", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "467--481", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037713", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With exploding traffic stuffing existing network infrastructure, today's telecommunication and cloud service providers resort to Network Function Virtualization (NFV) for greater agility and economics. Pioneer service provider such as AT{\&}T proposes to adopt container in NFV to achieve shorter Virtualized Network Function (VNF) provisioning time and better runtime performance. However, we characterize typical NFV workloads on the containers and find that the performance is unsatisfactory. We observe that the shared host OS network stack is the main bottleneck, where the traffic flow processing involves a large amount of intermediate memory buffers and results in significant last level cache pollution. Existing OS memory allocation policies fail to exploit the locality and data sharing information among buffers. 
In this paper, we propose NetContainer, a software framework that achieves fine-grained hardware resource management for containerized NFV platform. NetContainer employs a cache access overheads guided page coloring scheme to coordinately address the inter-flow cache access overheads and intra-flow cache access overheads. It maps the memory buffer pages that manifest low cache access overheads (across a flow or among the flows) to the same last level cache partition. NetContainer exploits a footprint theory based method to estimate the cache access overheads and a Min-Cost Max-Flow model to guide the memory buffer mappings. We implement the NetContainer in Linux kernel and extensively evaluate it with real NFV workloads. Experimental results show that NetContainer outperforms conventional page coloring-based memory allocator by 48\% in terms of successful call rate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Wu:2017:FEF, author = "Bo Wu and Xu Liu and Xiaobo Zhou and Changjun Jiang", title = "{FLEP}: Enabling Flexible and Efficient Preemption on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "483--496", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037742", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "GPUs are widely adopted in HPC and cloud computing platforms to accelerate general-purpose workloads. However, modern GPUs do not support flexible preemption, leading to performance and priority inversion problems in multi-tasking environments. In this paper, we propose and develop FLEP, the first software system that enables flexible kernel preemption and kernel scheduling on commodity GPUs. 
The FLEP compilation engine transforms the GPU program into preemptable forms, which can be interrupted during execution and yield all or part of the streaming multi-processors (SMs) in the GPU. The FLEP runtime engine intercepts all kernel invocations and determines which kernels and how those kernels should be preempted and scheduled. Experimental results on two-kernel co-runs demonstrate up to 24.2X speedup for high-priority kernels and up to 27X improvement on normalized average turnaround time for kernels with the same priority. FLEP reduces the preemption latency by up to 41\% compared to yielding the whole GPU when the waiting kernels only need several SMs. With all the benefits, FLEP only introduces 2.5\% runtime overhead, which is substantially lower than the kernel slicing approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Li:2017:SSA, author = "Kaiwei Li and Jianfei Chen and Wenguang Chen and Jun Zhu", title = "{SaberLDA}: Sparsity-Aware Learning of Topic Models on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "497--509", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037740", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Latent Dirichlet Allocation (LDA) is a popular tool for analyzing discrete count data such as text and images. Applications require LDA to handle both large datasets and a large number of topics. Though distributed CPU systems have been used, GPU-based systems have emerged as a promising alternative because of the high computational power and memory bandwidth of GPUs. 
However, existing GPU-based LDA systems cannot support a large number of topics because they use algorithms on dense data structures whose time and space complexity is linear to the number of topics. In this paper, we propose SaberLDA, a GPU-based LDA system that implements a sparsity-aware algorithm to achieve sublinear time complexity and scales well to learn a large number of topics. To address the challenges introduced by sparsity, we propose a novel data layout, a new warp-based sampling kernel, and an efficient sparse count matrix updating algorithm that improves locality, makes efficient utilization of GPU warps, and reduces memory consumption. Experiments show that SaberLDA can learn from billions-token-scale data with up to 10,000 topics, which is almost two orders of magnitude larger than that of the previous GPU-based systems. With a single GPU card, SaberLDA is able to learn 10,000 topics from a dataset of billions of tokens in a few hours, which is only achievable with clusters with tens of machines before.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Khazraee:2017:MNO, author = "Moein Khazraee and Lu Zhang and Luis Vega and Michael Bedford Taylor", title = "{Moonwalk}: {NRE} Optimization in {ASIC} Clouds", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "511--526", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037749", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cloud services are becoming increasingly globalized and data-center workloads are expanding exponentially. GPU and FPGA-based clouds have illustrated improvements in power and performance by accelerating compute-intensive workloads. 
ASIC-based clouds are a promising way to optimize the Total Cost of Ownership (TCO) of a given datacenter computation (e.g. YouTube transcoding) by reducing both energy consumption and marginal computation cost. The feasibility of an ASIC Cloud for a particular application is directly gated by the ability to manage the Non-Recurring Engineering (NRE) costs of designing and fabricating the ASIC, so that it is significantly lower (e.g. 2X) than the TCO of the best available alternative. In this paper, we show that technology node selection is a major tool for managing ASIC Cloud NRE, and allows the designer to trade off an accelerator's excess energy efficiency and cost performance for lower total cost. We explore NRE and cross-technology optimization of ASIC Clouds for four different applications: Bitcoin mining, YouTube-style video transcoding, Litecoin, and Deep Learning. We address these challenges and show large reductions in the NRE, potentially enabling ASIC Clouds to address a wider variety of datacenter workloads. 
Our results suggest that advanced nodes like 16nm will lead to sub-optimal TCO for many workloads, and that use of older nodes like 65nm can enable a greater diversity of ASIC Clouds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Park:2017:DRM, author = "Jason Jong Kyu Park and Yongjun Park and Scott Mahlke", title = "Dynamic Resource Management for Efficient Utilization of Multitasking {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "527--540", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037707", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As graphics processing units (GPUs) are broadly adopted, running multiple applications on a GPU at the same time is beginning to attract wide attention. Recent proposals on multitasking GPUs have focused on either spatial multitasking, which partitions GPU resource at a streaming multiprocessor (SM) granularity, or simultaneous multikernel (SMK), which runs multiple kernels on the same SM. However, multitasking performance varies heavily depending on the resource partitions within each scheme, and the application mixes. In this paper, we propose GPU Maestro that performs dynamic resource management for efficient utilization of multitasking GPUs. GPU Maestro can discover the best performing GPU resource partition exploiting both spatial multitasking and SMK. Furthermore, dynamism within a kernel and interference between the kernels are automatically considered because GPU Maestro finds the best performing partition through direct measurements. 
Evaluations show that GPU Maestro can improve average system throughput by 20.2\% and 13.9\% over the baseline spatial multitasking and SMK, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Zhang:2017:ISC, author = "Rui Zhang and Natalie Stanley and Christopher Griggs and Andrew Chi and Cynthia Sturton", title = "Identifying Security Critical Properties for the Dynamic Verification of a Processor", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "541--554", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037734", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present a methodology for identifying security critical properties for use in the dynamic verification of a processor. Such verification has been shown to be an effective way to prevent exploits of vulnerabilities in the processor, given a meaningful set of security properties. We use known processor errata to establish an initial set of security-critical invariants of the processor. We then use machine learning to infer an additional set of invariants that are not tied to any particular, known vulnerability, yet are critical to security. We build a tool chain implementing the approach and evaluate it for the open-source OR1200 RISC processor. 
We find that our tool can identify 19 (86.4\%) of the 22 manually crafted security-critical properties from prior work and generates 3 new security properties not covered in prior work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Ferraiuolo:2017:VPH, author = "Andrew Ferraiuolo and Rui Xu and Danfeng Zhang and Andrew C. Myers and G. Edward Suh", title = "Verification of a Practical Hardware Security Architecture Through Static Information Flow Analysis", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "555--568", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037739", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Hardware-based mechanisms for software isolation are becoming increasingly popular, but implementing these mechanisms correctly has proved difficult, undermining the root of security. This work introduces an effective way to formally verify important properties of such hardware security mechanisms. In our approach, hardware is developed using a lightweight security-typed hardware description language (HDL) that performs static information flow analysis. We show the practicality of our approach by implementing and verifying a simplified but realistic multi-core prototype of the ARM TrustZone architecture. To make the security-typed HDL expressive enough to verify a realistic processor, we develop new type system features. Our experiments suggest that information flow analysis is efficient, and programmer effort is modest. 
We also show that information flow constraints are an effective way to detect hardware vulnerabilities, including several found in commercial processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Chisnall:2017:CJS, author = "David Chisnall and Brooks Davis and Khilan Gudka and David Brazdil and Alexandre Joannou and Jonathan Woodruff and A. Theodore Markettos and J. Edward Maste and Robert Norton and Stacey Son and Michael Roe and Simon W. Moore and Peter G. Neumann and Ben Laurie and Robert N. M. Watson", title = "{CHERI JNI}: Sinking the {Java} Security Model into the {C}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "569--583", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037725", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Java provides security and robustness by building a high-level security model atop the foundation of memory protection. Unfortunately, any native code linked into a Java program --- including the million lines used to implement the standard library --- is able to bypass both the memory protection and the higher-level policies. We present a hardware-assisted implementation of the Java native code interface, which extends the guarantees required for Java's security model to native code. Our design supports safe direct access to buffers owned by the JVM, including hardware-enforced read-only access where appropriate. We also present Java language syntax to declaratively describe isolated compartments for native code. 
We show that it is possible to preserve the memory safety and isolation requirements of the Java security model in C code, allowing native code to run in the same process as Java code with the same impact on security as running equivalent Java code. Our approach has a negligible impact on performance, compared with the existing unsafe native code interface. We demonstrate a prototype implementation running on the CHERI microprocessor synthesized in FPGA.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Ge:2017:GGC, author = "Xinyang Ge and Weidong Cui and Trent Jaeger", title = "{GRIFFIN}: Guarding Control Flows Using {Intel} Processor Trace", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "585--598", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037716", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Researchers are actively exploring techniques to enforce control-flow integrity (CFI), which restricts program execution to a predefined set of targets for each indirect control transfer to prevent code-reuse attacks. While hardware-assisted CFI enforcement may have the potential for advantages in performance and flexibility over software instrumentation, current hardware-assisted defenses are either incomplete (i.e., do not enforce all control transfers) or less efficient in comparison. We find that the recent introduction of hardware features to log complete control-flow traces, such as Intel Processor Trace (PT), provides an opportunity to explore how efficient and flexible a hardware-assisted CFI enforcement system may become. 
While Intel PT was designed to aid in offline debugging and failure diagnosis, we explore its effectiveness for online CFI enforcement over unmodified binaries by designing a parallelized method for enforcing various types of CFI policies. We have implemented a prototype called GRIFFIN in the Linux 4.2 kernel that enables complete CFI enforcement over a variety of software, including the Firefox browser and its jitted code. Our experiments show that GRIFFIN can enforce fine-grained CFI policies with shadow stack as recommended by researchers at a performance that is comparable to software-only instrumentation techniques. In addition, we find that alternative logging approaches yield significant performance improvements for trace processing, identifying opportunities for further hardware assistance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Delimitrou:2017:BKW, author = "Christina Delimitrou and Christos Kozyrakis", title = "{Bolt}: {I} Know What You Did Last Summer \ldots{} In The Cloud", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "599--613", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037703", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Cloud providers routinely schedule multiple applications per physical host to increase efficiency. The resulting interference on shared resources often leads to performance degradation and, more importantly, security vulnerabilities. Interference can leak important information ranging from a service's placement to confidential data, like private keys. 
We present Bolt, a practical system that accurately detects the type and characteristics of applications sharing a cloud platform based on the interference an adversary sees on shared resources. Bolt leverages online data mining techniques that only require 2--5 seconds for detection. In a multi-user study on EC2, Bolt correctly identifies the characteristics of 385 out of 436 diverse workloads. Extracting this information enables a wide spectrum of previously-impractical cloud attacks, including denial of service attacks (DoS) that increase tail latency by 140x, as well as resource freeing (RFA) and co-residency attacks. Finally, we show that while advanced isolation mechanisms, such as cache partitioning, lower detection accuracy, they are insufficient to eliminate these vulnerabilities altogether. To do so, one must either disallow core sharing, or only allow it between threads of the same application, leading to significant inefficiencies and performance penalties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Kang:2017:NCI, author = "Yiping Kang and Johann Hauswald and Cao Gao and Austin Rovinski and Trevor Mudge and Jason Mars and Lingjia Tang", title = "{Neurosurgeon}: Collaborative Intelligence Between the Cloud and Mobile Edge", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "615--629", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037698", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The computation for today's intelligent personal assistants such as Apple Siri, Google Now, and Microsoft Cortana, is performed in the cloud. 
This cloud-only approach requires significant amounts of data to be sent to the cloud over the wireless network and puts significant computational pressure on the datacenter. However, as the computational resources in mobile devices become more powerful and energy efficient, questions arise as to whether this cloud-only processing is desirable moving forward, and what are the implications of pushing some or all of this compute to the mobile devices on the edge. In this paper, we examine the status quo approach of cloud-only processing and investigate computation partitioning strategies that effectively leverage both the cycles in the cloud and on the mobile device to achieve low latency, low energy consumption, and high datacenter throughput for this class of intelligent applications. Our study uses 8 intelligent applications spanning computer vision, speech, and natural language domains, all employing state-of-the-art Deep Neural Networks (DNNs) as the core machine learning technique. We find that given the characteristics of DNN algorithms, a fine-grained, layer-level computation partitioning strategy based on the data and computation variations of each layer within a DNN has significant latency and energy advantages over the status quo approach. Using this insight, we design Neurosurgeon, a lightweight scheduler to automatically partition DNN computation between mobile devices and datacenters at the granularity of neural network layers. Neurosurgeon does not require per-application profiling. It adapts to various DNN architectures, hardware platforms, wireless networks, and server load levels, intelligently partitioning computation for best latency or best mobile energy. 
We evaluate Neurosurgeon on a state-of-the-art mobile development platform and show that it improves end-to-end latency by 3.1X on average and up to 40.7X, reduces mobile energy consumption by 59.5\% on average and up to 94.7\%, and improves datacenter throughput by 1.5X on average and up to 6.7X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Agarwal:2017:TAT, author = "Neha Agarwal and Thomas F. Wenisch", title = "{Thermostat}: Application-transparent Page Management for Two-tiered Main Memory", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "631--644", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037706", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The advent of new memory technologies that are denser and cheaper than commodity DRAM has renewed interest in two-tiered main memory schemes. Infrequently accessed application data can be stored in such memories to achieve significant memory cost savings. Past research on two-tiered main memory has assumed a 4KB page size. However, 2MB huge pages are performance critical in cloud applications with large memory footprints, especially in virtualized cloud environments, where nested paging drastically increases the cost of 4KB page management. We present Thermostat, an application-transparent huge-page-aware mechanism to place pages in a dual-technology hybrid memory system while achieving both the cost advantages of two-tiered memory and performance advantages of transparent huge pages. 
We present an online page classification mechanism that accurately classifies both 4KB and 2MB pages as hot or cold while incurring no observable performance overhead across several representative cloud applications. We implement Thermostat in Linux kernel version 4.5 and evaluate its effectiveness on representative cloud computing workloads running under KVM virtualization. We emulate slow memory with performance characteristics approximating near-future high-density memory technology and show that Thermostat migrates up to 50\% of application footprint to slow memory while limiting performance degradation to 3\%, thereby reducing memory cost up to 30\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Barbalace:2017:BBH, author = "Antonio Barbalace and Robert Lyerly and Christopher Jelesnianski and Anthony Carno and Ho-Ren Chuang and Vincent Legout and Binoy Ravindran", title = "Breaking the Boundaries in Heterogeneous-{ISA} Datacenters", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "645--659", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037738", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Energy efficiency is one of the most important design considerations in running modern datacenters. Datacenter operating systems rely on software techniques such as execution migration to achieve energy efficiency across pools of machines. Execution migration is possible in datacenters today because they consist mainly of homogeneous-ISA machines. However, recent market trends indicate that alternate ISAs such as ARM and PowerPC are pushing into the datacenter, meaning current execution migration techniques are no longer applicable. 
How can execution migration be applied in future heterogeneous-ISA datacenters? In this work we present a compiler, runtime, and an operating system extension for enabling execution migration between heterogeneous-ISA servers. We present a new multi-ISA binary architecture and heterogeneous-OS containers for facilitating efficient migration of natively-compiled applications. We build and evaluate a prototype of our design and demonstrate energy savings of up to 66\% for a workload running on an ARM and an x86 server interconnected by a high-speed network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Lustig:2017:ASC, author = "Daniel Lustig and Andrew Wright and Alexandros Papakonstantinou and Olivier Giroux", title = "Automated Synthesis of Comprehensive Memory Model Litmus Test Suites", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "661--675", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037723", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The memory consistency model is a fundamental part of any shared memory architecture or programming model. Modern weak memory models are notoriously difficult to define and to implement correctly. Most real-world programming languages, compilers, and (micro)architectures therefore rely heavily on black-box testing methodologies. The success of such techniques requires that the suite of litmus tests used to perform the testing be comprehensive---it should ideally stress all obscure corner cases of the model and of its implementation. 
Most litmus test suites today are generated from some combination of manual effort and randomization; however, the complex and subtle nature of contemporary memory models means that manual effort is both error-prone and subject to incomplete coverage. This paper presents a methodology for synthesizing comprehensive litmus test suites directly from a memory model specification. By construction, these suites contain all tests satisfying a minimality criterion: that no synchronization mechanism in the test can be weakened without causing new behaviors to become observable. We formalize this notion using the Alloy modeling language, and we apply it to a number of existing and newly-proposed memory models. Our results show not only that this synthesis technique can automatically reproduce all manually-generated tests from existing suites, but also that it discovers new tests that are not as well studied.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Liu:2017:DAD, author = "Haopeng Liu and Guangpu Li and Jeffrey F. Lukman and Jiaxin Li and Shan Lu and Haryadi S. Gunawi and Chen Tian", title = "{DCatch}: Automatically Detecting Distributed Concurrency Bugs in Cloud Systems", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "677--691", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037735", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In big data and cloud computing era, reliability of distributed systems is extremely important. Unfortunately, distributed concurrency bugs, referred to as DCbugs, widely exist. 
They hide in the large state space of distributed cloud systems and manifest non-deterministically depending on the timing of distributed computation and communication. Effective techniques to detect DCbugs are desired. This paper presents a pilot solution, DCatch, in the world of DCbug detection. DCatch predicts DCbugs by analyzing correct execution of distributed systems. To build DCatch, we design a set of happens-before rules that model a wide variety of communication and concurrency mechanisms in real-world distributed cloud systems. We then build runtime tracing and trace analysis tools to effectively identify concurrent conflicting memory accesses in these systems. Finally, we design tools to help prune false positives and trigger DCbugs. We have evaluated DCatch on four representative open-source distributed cloud systems, Cassandra, Hadoop MapReduce, HBase, and ZooKeeper. By monitoring correct execution of seven workloads on these systems, DCatch reports 32 DCbugs, with 20 of them being truly harmful.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Mashtizadeh:2017:TPD, author = "Ali Jos{\'e} Mashtizadeh and Tal Garfinkel and David Terei and David Mazieres and Mendel Rosenblum", title = "Towards Practical Default-On Multi-Core Record\slash Replay", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "693--708", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037751", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "We present Castor, a record/replay system for multi-core applications that provides consistently low and predictable overheads. 
With Castor, developers can leave record and replay on by default, making it practical to record and reproduce production bugs, or employ fault tolerance to recover from hardware failures. Castor is inspired by several observations: First, an efficient mechanism for logging non-deterministic events is critical for recording demanding workloads with low overhead. Through careful use of hardware we were able to increase log throughput by 10x or more, e.g., we could record a server handling 10x more requests per second for the same record overhead. Second, most applications can be recorded without modifying source code by using the compiler to instrument language level sources of non-determinism, in conjunction with more familiar techniques like shared library interposition. Third, while Castor cannot deterministically replay all data races, this limitation is generally unimportant in practice, contrary to what prior work has assumed. Castor currently supports applications written in C, C++, and Go on FreeBSD. We have evaluated Castor on parallel and server workloads, including a commercial implementation of memcached in Go, which runs Castor in production.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Huang:2017:PSA, author = "Jian Huang and Michael Allen-Bond and Xuechen Zhang", title = "{Pallas}: Semantic-Aware Checking for Finding Deep Bugs in Fast Path", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "709--722", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037743", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Software optimization is constantly a serious concern for developing high-performance systems. 
To accelerate the workflow execution of a specific functionality, software developers usually define and implement a fast path to speed up the critical and commonly executed functions in the workflow. However, producing a bug-free fast path is nontrivial. Our study on the Linux kernel discloses that a committed fast path can have up to 19 follow-up patches for bug fixing, and most of them are deep semantic bugs, which are difficult to be pinpointed by existing bug-finding tools. In this paper, we present such a new category of software bugs based on our fast-path bug study across various system software including virtual memory manager, file systems, network, and device drivers. We investigate their root causes and identify five error-prone aspects in a fast path: path state, trigger condition, path output, fault handling, and assistant data structure. We find that many of the deep bugs can be prevented by applying static analysis incorporating simple semantic information. We extract a set of rules based on our findings and build a toolkit PALLAS to check fast-path bugs. The evaluation results show that PALLAS can effectively reveal fast-path bugs in a variety of systems including Linux kernel, mobile operating system, software-defined networking system, and web browser.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Kotra:2017:HSC, author = "Jagadish B. Kotra and Narges Shahidi and Zeshan A. Chishti and Mahmut T. 
Kandemir", title = "Hardware-Software Co-design to Mitigate {DRAM} Refresh Overheads: a Case for Refresh-Aware Process Scheduling", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "723--736", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037724", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "DRAM cells need periodic refresh to maintain data integrity. With high capacity DRAMs, DRAM refresh poses a significant performance bottleneck as the number of rows to be refreshed (and hence the refresh cycle time, tRFC) with each refresh command increases. Modern day DRAMs perform refresh at a rank-level, while LPDDRs used in mobile environments support refresh at a per-bank level. Rank-level refresh degrades the performance significantly since none of the banks in a rank can serve the on-demand requests. Per-bank refresh alleviates some of the performance bottlenecks as the other banks in a rank are available for on-demand requests. Typical DRAM retention time is in the order of several milliseconds, viz, 64msec for environments operating in temperatures below 85 deg C and 32msec for environments operating above 85 deg C. With systems moving towards increased consolidation (ex: virtualized environments), DRAM refresh becomes a significant bottleneck as it reduces the available overall DRAM bandwidth per task. In this work, we propose a hardware-software co-design to mitigate DRAM refresh overheads by exposing the hardware address mapping and DRAM refresh schedule to the Operating System. We propose a novel DRAM refresh-aware process scheduling algorithm in OS which schedules applications on cores such that none of the on-demand requests from the application are stalled by refreshes. 
Extensive evaluation of our proposed co-design on multi-programmed SPEC CPU2006 workloads show significant performance improvement compared to the previously proposed hardware only approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Kim:2017:KPC, author = "Jinchun Kim and Elvira Teran and Paul V. Gratz and Daniel A. Jim{\'e}nez and Seth H. Pugsley and Chris Wilkerson", title = "Kill the Program Counter: Reconstructing Program Behavior in the Processor Cache Hierarchy", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "737--749", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037701", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Data prefetching and cache replacement algorithms have been intensively studied in the design of high performance microprocessors. Typically, the data prefetcher operates in the private caches and does not interact with the replacement policy in the shared Last-Level Cache (LLC). Similarly, most replacement policies do not consider demand and prefetch requests as different types of requests. In particular, program counter (PC)-based replacement policies cannot learn from prefetch requests since the data prefetcher does not generate a PC value. PC-based policies can also be negatively affected by compiler optimizations. In this paper, we propose a holistic cache management technique called Kill-the-PC (KPC) that overcomes the weaknesses of traditional prefetching and replacement policy algorithms. KPC cache management has three novel contributions. First, a prefetcher which approximates the future use distance of prefetch requests based on its prediction confidence. 
Second, a simple replacement policy provides similar or better performance than current state-of-the-art PC-based prediction using global hysteresis. Third, KPC integrates prefetching and replacement policy into a whole system which is greater than the sum of its parts. Information from the prefetcher is used to improve the performance of the replacement policy and vice-versa. Finally, KPC removes the need to propagate the PC through entire on-chip cache hierarchy while providing a holistic cache management approach with better performance than state-of-the-art PC-, and non-PC-based schemes. Our evaluation shows that KPC provides 8\% better performance than the best combination of existing prefetcher and replacement policy for multi-core workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Gao:2017:TSE, author = "Mingyu Gao and Jing Pu and Xuan Yang and Mark Horowitz and Christos Kozyrakis", title = "{TETRIS}: Scalable and Efficient Neural Network Acceleration with {$3$D} Memory", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "751--764", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037702", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The high accuracy of deep neural networks (NNs) has led to the development of NN accelerators that improve performance by two orders of magnitude. However, scaling these accelerators for higher performance with increasingly larger NNs exacerbates the cost and energy overheads of their memory systems, including the on-chip SRAM buffers and the off-chip DRAM channels. 
This paper presents the hardware architecture and software scheduling and partitioning techniques for TETRIS, a scalable NN accelerator using 3D memory. First, we show that the high throughput and low energy characteristics of 3D memory allow us to rebalance the NN accelerator design, using more area for processing elements and less area for SRAM buffers. Second, we move portions of the NN computations close to the DRAM banks to decrease bandwidth pressure and increase performance and energy efficiency. Third, we show that despite the use of small SRAM buffers, the presence of 3D memory simplifies dataflow scheduling for NN computations. We present an analytical scheduling scheme that matches the efficiency of schedules derived through exhaustive search. Finally, we develop a hybrid partitioning scheme that parallelizes the NN computations over multiple accelerators. Overall, we show that TETRIS improves the performance by 4.1x and reduces the energy by 1.5x over NN accelerators with conventional, low-power DRAM memory systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Song:2017:HBA, author = "Wonjun Song and Gwangsun Kim and Hyungjoon Jung and Jongwook Chung and Jung Ho Ahn and Jae W. Lee and John Kim", title = "History-Based Arbitration for Fairness in Processor-Interconnect of {NUMA} Servers", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "765--777", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037753", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "NUMA (non-uniform memory access) servers are commonly used in high-performance computing and datacenters. 
Within each server, a processor-interconnect (e.g., Intel QPI, AMD HyperTransport) is used to communicate between the different sockets or nodes. In this work, we explore the impact of the processor-interconnect on overall performance --- in particular, the performance unfairness caused by processor-interconnect arbitration. It is well known that locally-fair arbitration does not guarantee globally-fair bandwidth sharing as closer nodes receive more bandwidth in a multi-hop network. However, this work demonstrates that the opposite can occur in a commodity NUMA server where remote nodes receive higher bandwidth (and perform better). We analyze this problem and identify that this occurs because of external concentration used in router micro-architectures for processor-interconnects without globally-aware arbitration. While accessing remote memory can occur in any NUMA system, performance unfairness (or performance variation) is more critical in cloud computing and virtual machines with shared resources. We demonstrate how this unfairness creates significant performance variation when a workload is executed on the Xen virtualization platform. We then provide analysis using synthetic workloads to better understand the source of unfairness and eliminate the impact of other shared resources, including the shared last-level cache and main memory. To provide fairness, we propose a novel, history-based arbitration that tracks the history of arbitration grants made in the previous history window. A weighted arbitration is done based on the history to provide global fairness. Through simulations, we show our proposed history-based arbitration can provide global fairness and minimize the processor-interconnect performance unfairness at low cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Misra:2017:ELT, author = "Pulkit A. 
Misra and Jeffrey S. Chase and Johannes Gehrke and Alvin R. Lebeck", title = "Enabling Lightweight Transactions with Precision Time", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "779--794", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037722", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Distributed transactional storage is an important service in today's data centers. Achieving high performance without high complexity is often a challenge for these systems due to sophisticated consistency protocols and multiple layers of abstraction. In this paper we show how to combine two emerging technologies---Software-Defined Flash (SDF) and precise synchronized clocks---to improve performance and reduce complexity for transactional storage within the data center. We present a distributed transactional system (called MILANA) as a layer above a durable multi-version key-value store (called SEMEL) for read-heavy workloads within a data center. SEMEL exploits write behavior of SSDs to maintain a time-ordered sequence of versions for each key efficiently and durably. MILANA adds a variant of optimistic concurrency control above SEMEL's API to service read requests from a consistent snapshot and to enable clients to make fast local commit or abort decisions for read-only transactions. Experiments with the prototype reveal up to 43\% lower transaction abort rates using IEEE Precision Time Protocol (PTP) vs. the standard Network Time Protocol (NTP). 
Under the Retwis benchmark, client-local validation of read-only transactions yields a 35\% reduction in latency and 55\% increase in transaction throughput.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Liu:2017:ITN, author = "Ming Liu and Liang Luo and Jacob Nelson and Luis Ceze and Arvind Krishnamurthy and Kishore Atreya", title = "{IncBricks}: Toward In-Network Computation with an In-Network Cache", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "795--809", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037731", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The emergence of programmable network devices and the increasing data traffic of datacenters motivate the idea of in-network computation. By offloading compute operations onto intermediate networking devices (e.g., switches, network accelerators, middleboxes), one can (1) serve network requests on the fly with low latency; (2) reduce datacenter traffic and mitigate network congestion; and (3) save energy by running servers in a low-power mode. However, since (1) existing switch technology doesn't provide general computing capabilities, and (2) commodity datacenter networks are complex (e.g., hierarchical fat-tree topologies, multipath communication), enabling in-network computation inside a datacenter is challenging. In this paper, as a step towards in-network computing, we present IncBricks, an in-network caching fabric with basic computing primitives. IncBricks is a hardware-software co-designed system that supports caching in the network using a programmable network middlebox. 
As a key-value store accelerator, our prototype lowers request latency by over 30\% and doubles throughput for 1024 byte values in a common cluster configuration. Our results demonstrate the effectiveness of in-network computing and that efficient datacenter network request processing is possible if we carefully split the computation across the different programmable computing elements in a datacenter, including programmable switches, network accelerators, and end hosts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Akturk:2017:AAA, author = "Ismail Akturk and Ulya R. Karpuzcu", title = "{AMNESIAC}: Amnesic Automatic Computer", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "811--824", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037741", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Due to imbalances in technology scaling, the energy consumption of data storage and communication by far exceeds the energy consumption of actual data production, i.e., computation. As a consequence, recomputing data can become more energy efficient than storing and retrieving precomputed data. At the same time, recomputation can relax the pressure on the memory hierarchy and the communication bandwidth. This study hence assesses the energy efficiency prospects of trading computation for communication. 
We introduce an illustrative proof-of-concept design, identify practical limitations, and provide design guidelines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Bai:2017:VRE, author = "Yuxin Bai and Victor W. Lee and Engin Ipek", title = "Voltage Regulator Efficiency Aware Power Management", journal = j-COMP-ARCH-NEWS, volume = "45", number = "1", pages = "825--838", month = mar, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3093337.3037717", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Jun 5 18:01:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Conventional off-chip voltage regulators are typically bulky and slow, and are inefficient at exploiting system and workload variability using Dynamic Voltage and Frequency Scaling (DVFS). On-die integration of voltage regulators has the potential to increase the energy efficiency of computer systems by enabling power control at a fine granularity in both space and time. The energy conversion efficiency of on-chip regulators, however, is typically much lower than off-chip regulators, which results in significant energy losses. Fine-grained power control and high voltage regulator efficiency are difficult to achieve simultaneously, with either emerging on-chip or conventional off-chip regulators. A voltage conversion framework that relies on a hierarchy of off-chip switching regulators and on-chip linear regulators is proposed to enable fine-grained power control with a regulator efficiency greater than 90\%. A DVFS control policy that is based on a reinforcement learning (RL) approach is developed to exploit the proposed framework. 
Per-core RL agents learn and improve their control policies independently, while retaining the ability to coordinate their actions to accomplish system level power management objectives. When evaluated on a mix of 14 parallel and 13 multiprogrammed workloads, the proposed voltage conversion framework achieves 18\% greater energy efficiency than a conventional framework that uses on-chip switching regulators. Moreover, when the RL based DVFS control policy is used to control the proposed voltage conversion framework, the system achieves a 21\% higher energy efficiency over a baseline oracle policy with coarse-grained power control capability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", remark = "ASPLOS'17 conference proceedings", } @Article{Jouppi:2017:DPA, author = "Norman P. Jouppi and Cliff Young and Nishant Patil and David Patterson and Gaurav Agrawal and Raminder Bajwa and Sarah Bates and Suresh Bhatia and Nan Boden and Al Borchers and Rick Boyle and Pierre-luc Cantin and Clifford Chao and Chris Clark and Jeremy Coriell and Mike Daley and Matt Dau and Jeffrey Dean and Ben Gelb and Tara Vazir Ghaemmaghami and Rajendra Gottipati and William Gulland and Robert Hagmann and C. 
Richard Ho and Doug Hogberg and John Hu and Robert Hundt and Dan Hurt and Julian Ibarz and Aaron Jaffey and Alek Jaworski and Alexander Kaplan and Harshit Khaitan and Daniel Killebrew and Andy Koch and Naveen Kumar and Steve Lacy and James Laudon and James Law and Diemthu Le and Chris Leary and Zhuyuan Liu and Kyle Lucke and Alan Lundin and Gordon MacKean and Adriana Maggiore and Maire Mahony and Kieran Miller and Rahul Nagarajan and Ravi Narayanaswami and Ray Ni and Kathy Nix and Thomas Norrie and Mark Omernick and Narayana Penukonda and Andy Phelps and Jonathan Ross and Matt Ross and Amir Salek and Emad Samadiani and Chris Severn and Gregory Sizikov and Matthew Snelham and Jed Souter and Dan Steinberg and Andy Swing and Mercedes Tan and Gregory Thorson and Bo Tian and Horia Toma and Erick Tuttle and Vijay Vasudevan and Richard Walter and Walter Wang and Eric Wilcox and Doe Hyun Yoon", title = "In-Datacenter Performance Analysis of a Tensor Processing Unit", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "1--12", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080246", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Many architects believe that major improvements in cost-energy-performance must now come from domain-specific hardware. This paper evaluates a custom ASIC---called a Tensor Processing Unit (TPU) --- deployed in datacenters since 2015 that accelerates the inference phase of neural networks (NN). The heart of the TPU is a 65,536 8-bit MAC matrix multiply unit that offers a peak throughput of 92 TeraOps/second (TOPS) and a large (28 MiB) software-managed on-chip memory. 
The TPU's deterministic execution model is a better match to the 99th-percentile response-time requirement of our NN applications than are the time-varying optimizations of CPUs and GPUs that help average throughput more than guaranteed latency. The lack of such features helps explain why, despite having myriad MACs and a big memory, the TPU is relatively small and low power. We compare the TPU to a server-class Intel Haswell CPU and an Nvidia K80 GPU, which are contemporaries deployed in the same datacenters. Our workload, written in the high-level TensorFlow framework, uses production NN applications (MLPs, CNNs, and LSTMs) that represent 95\% of our datacenters' NN inference demand. Despite low utilization for some applications, the TPU is on average about 15X --- 30X faster than its contemporary GPU or CPU, with TOPS/Watt about 30X --- 80X higher. Moreover, using the GPU's GDDR5 memory in the TPU would triple achieved TOPS and raise TOPS/Watt to nearly 70X the GPU and 200X the CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Venkataramani:2017:SSC, author = "Swagath Venkataramani and Ashish Ranjan and Subarno Banerjee and Dipankar Das and Sasikanth Avancha and Ashok Jagannathan and Ajaya Durg and Dheemanth Nagaraj and Bharat Kaul and Pradeep Dubey and Anand Raghunathan", title = "{ScaleDeep}: a Scalable Compute Architecture for Learning and Evaluating Deep Networks", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "13--26", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080244", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Deep Neural Networks (DNNs) have demonstrated state-of-the-art performance on a broad range of tasks involving natural language, speech, image, 
and video processing, and are deployed in many real world applications. However, DNNs impose significant computational challenges owing to the complexity of the networks and the amount of data they process, both of which are projected to grow in the future. To improve the efficiency of DNNs, we propose ScaleDeep, a dense, scalable server architecture, whose processing, memory and interconnect subsystems are specialized to leverage the compute and communication characteristics of DNNs. While several DNN accelerator designs have been proposed in recent years, the key difference is that ScaleDeep primarily targets DNN training, as opposed to only inference or evaluation. The key architectural features from which ScaleDeep derives its efficiency are: (i) heterogeneous processing tiles and chips to match the wide diversity in computational characteristics (FLOPs and Bytes/FLOP ratio) that manifest at different levels of granularity in DNNs, (ii) a memory hierarchy and 3-tiered interconnect topology that is suited to the memory access and communication patterns in DNNs, (iii) a low-overhead synchronization mechanism based on hardware data-flow trackers, and (iv) methods to map DNNs to the proposed architecture that minimize data movement and improve core utilization through nested pipelining. We have developed a compiler to allow any DNN topology to be programmed onto ScaleDeep, and a detailed architectural simulator to estimate performance and energy. The simulator incorporates timing and power models of ScaleDeep's components based on synthesis to Intel's 14nm technology. We evaluate an embodiment of ScaleDeep with 7032 processing tiles that operates at 600 MHz and has a peak performance of 680 TFLOPs (single precision) and 1.35 PFLOPs (half-precision) at 1.4KW. 
Across 11 state-of-the-art DNNs containing 0.65M-14.9M neurons and 6.8M-145.9M weights, including winners from 5 years of the ImageNet competition, ScaleDeep demonstrates 6x-28x speedup at iso-power over the state-of-the-art performance on GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Parashar:2017:SAC, author = "Angshuman Parashar and Minsoo Rhu and Anurag Mukkara and Antonio Puglielli and Rangharajan Venkatesan and Brucek Khailany and Joel Emer and Stephen W. Keckler and William J. Dally", title = "{SCNN}: an Accelerator for Compressed-sparse Convolutional Neural Networks", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "27--40", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080254", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Convolutional Neural Networks (CNNs) have emerged as a fundamental technology for machine learning. High performance and extreme energy efficiency are critical for deployments of CNNs, especially in mobile platforms such as autonomous vehicles, cameras, and electronic personal assistants. This paper introduces the Sparse CNN (SCNN) accelerator architecture, which improves performance and energy efficiency by exploiting the zero-valued weights that stem from network pruning during training and zero-valued activations that arise from the common ReLU operator. Specifically, SCNN employs a novel dataflow that enables maintaining the sparse weights and activations in a compressed encoding, which eliminates unnecessary data transfers and reduces storage requirements. 
Furthermore, the SCNN dataflow facilitates efficient delivery of those weights and activations to a multiplier array, where they are extensively reused; product accumulation is performed in a novel accumulator array. On contemporary neural networks, SCNN can improve both performance and energy by a factor of 2.7x and 2.3x, respectively, over a comparably provisioned dense CNN accelerator.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Cherupalli:2017:BPA, author = "Hari Cherupalli and Henry Duwe and Weidong Ye and Rakesh Kumar and John Sartori", title = "Bespoke Processors for Applications with Ultra-low Area and Power Constraints", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "41--54", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080247", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A large number of emerging applications such as implantables, wearables, printed electronics, and IoT have ultra-low area and power constraints. These applications rely on ultra-low-power general purpose microcontrollers and microprocessors, making them the most abundant type of processor produced and used today. While general purpose processors have several advantages, such as amortized development cost across many applications, they are significantly over-provisioned for many area- and power-constrained systems, which tend to run only one or a small number of applications over their lifetime. In this paper, we make a case for bespoke processor design, an automated approach that tailors a general purpose processor IP to a target application by removing all gates from the design that can never be used by the application. 
Since removed gates are never used by an application, bespoke processors can achieve significantly lower area and power than their general purpose counterparts without any performance degradation. Also, gate removal can expose additional timing slack that can be exploited to increase area and power savings or performance of a bespoke design. Bespoke processor design reduces area and power by 62\% and 50\%, on average, while exploiting exposed timing slack improves average power savings to 65\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:2017:PGF, author = "Yajing Chen and Shengshuo Lu and Cheng Fu and David Blaauw and Ronald {Dreslinski, Jr.} and Trevor Mudge and Hun-Seok Kim", title = "A Programmable {Galois} Field Processor for the {Internet of Things}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "55--68", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080227", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper investigates the feasibility of a unified processor architecture to enable error coding flexibility and secure communication in low power Internet of Things (IoT) wireless networks. Error coding flexibility for wireless communication allows IoT applications to exploit the large tradeoff space in data rate, link distance and energy-efficiency. As a solution, we present a light-weight Galois Field (GF) processor to enable energy-efficient block coding and symmetric/asymmetric cryptography kernel processing for a wide range of GF sizes ($ 2^m $, $ m = 2, 3, \ldots, 233 $) and arbitrary irreducible polynomials. 
Program directed connections among primitive GF arithmetic units enable dynamically configured parallelism to efficiently perform either four-way SIMD 5- to 8-bit GF operations, including multiplicative inverse, or a wide bit-width (e.g., 32-bit) GF product in a single cycle. To illustrate our ideas, we synthesized our GF processor in a 28nm technology. Compared to a baseline software implementation optimized for a general purpose ARM M0+ processor, our processor exhibits a 5-20x speedup for a range of error correction codes and symmetric/asymmetric cryptography applications. Additionally, our proposed GF processor consumes 431 $ \mu $W at 0.9V and 100MHz, and achieves 35.5pJ/b energy efficiency while executing AES operations at 12.2Mbps. We achieve this within an area of 0.01mm$^2$.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2017:XCE, author = "Aosen Wang and Lizhong Chen and Wenyao Xu", title = "{XPro}: a Cross-End Processing Architecture for Data Analytics in Wearables", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "69--80", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080219", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Wearable computing systems have spurred many opportunities to continuously monitor human bodies with sensors worn on or implanted in the body. These emerging platforms have started to revolutionize many fields, including healthcare and wellness applications, particularly when integrated with intelligent analytic capabilities. However, a significant challenge that computer architects are facing is how to embed sophisticated analytic capabilities in wearable computers in an energy-efficient way while not compromising system performance. 
In this paper, we present XPro, a novel cross-end analytic engine architecture for wearable computing systems. The proposed cross-end architecture is able to realize a generic classification design across wearable sensors and a data aggregator with high energy-efficiency. To facilitate the practical use of XPro, we also develop an Automatic XPro Generator that formally generates XPro instances according to specific design constraints. As a proof of concept, we study the design and implementation of XPro with six different health applications. Evaluation results show that, compared with state-of-the-art methods, XPro can increase the battery life of the sensor node by 1.6-2.4X while at the same time reducing system delay by 15.6-60.8\% for wearable computing systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Weisse:2017:RLC, author = "Ofir Weisse and Valeria Bertacco and Todd Austin", title = "Regaining Lost Cycles with {HotCalls}: a Fast Interface for {SGX} Secure Enclaves", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "81--93", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080208", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Intel's SGX secure execution technology allows running computations on secret data using untrusted servers. While recent work showed how to port applications and large-scale computations to run under SGX, the performance implications of using the technology remain an open question. We present the first comprehensive quantitative study to evaluate the performance of SGX. 
We show that straightforward use of SGX library primitives for calling functions adds between 8,200 and 17,000 cycles of overhead, compared to 150 cycles of a typical system call. We quantify the performance impact of these library calls and show that in applications with high system calls frequency, such as memcached, openVPN, and lighttpd, which all have high bandwidth network requirements, the performance degradation may be as high as 79\%. We investigate the sources of this performance degradation by leveraging a new set of microbenchmarks for SGX-specific operations such as enclave entry-calls and out-calls, and encrypted memory I/O accesses. We leverage the insights we gain from these analyses to design a new SGX interface framework HotCalls. HotCalls are based on a synchronization spin-lock mechanism and provide a 13-27x speedup over the default interface. It can easily be integrated into existing code, making it a practical solution. Compared to a baseline SGX implementation of memcached, openVPN, and lighttpd, we show that using the new interface boosts the throughput by 2.6-3.7x, and reduces application latency by 62-74\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Aga:2017:ISM, author = "Shaizeen Aga and Satish Narayanasamy", title = "{InvisiMem}: Smart Memory Defenses for Memory Bus Side Channel", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "94--106", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080232", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "A practically feasible low-overhead hardware design that provides strong defenses against memory bus side channel remains elusive. 
This paper observes that smart memory, memory with compute capability and a packetized interface, can dramatically simplify this problem. InvisiMem expands the trust base to include the logic layer in the smart memory to implement cryptographic primitives, which aid in addressing several memory bus side channel vulnerabilities efficiently. This allows the secure host processor to send encrypted addresses over the untrusted memory bus, and thereby eliminates the need for expensive address obfuscation techniques based on Oblivious RAM (ORAM). In addition, smart memory enables efficient solutions for ensuring freshness without using expensive Merkle trees, and mitigates memory bus timing channel using constant heart-beat packets. We demonstrate that InvisiMem designs have one to two orders of magnitude of lower overheads for performance, space, energy, and memory bandwidth, compared to prior solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Awad:2017:OLO, author = "Amro Awad and Yipeng Wang and Deborah Shands and Yan Solihin", title = "{ObfusMem}: a Low-Overhead Access Obfuscation for Trusted Memories", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "107--119", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080230", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Trustworthy software requires strong privacy and security guarantees from a secure trust base in hardware. While chipmakers provide hardware support for basic security and privacy primitives such as enclaves and memory encryption, these primitives do not address hiding of the memory access pattern, information about which may enable attacks on the system or reveal characteristics of sensitive user data. 
State-of-the-art approaches to protecting the access pattern are largely based on Oblivious RAM (ORAM). Unfortunately, current ORAM implementations suffer from very significant practicality and overhead concerns, including roughly an order of magnitude slowdown, more than 100\% memory capacity overheads, and the potential for system deadlock. Memory technology trends are moving towards 3D and 2.5D integration, enabling significant logic capabilities and sophisticated memory interfaces. Leveraging the trends, we propose a new approach to access pattern obfuscation, called ObfusMem. ObfusMem adds the memory to the trusted computing base and incorporates cryptographic engines within the memory. ObfusMem encrypts commands and addresses on the memory bus, hence the access pattern is cryptographically obfuscated from external observers. Our evaluation shows that ObfusMem incurs an overhead of 10.9\% on average, which is about an order of magnitude faster than ORAM implementations. Furthermore, ObfusMem does not incur capacity overheads and does not amplify writes. We analyze and compare the security protections provided by ObfusMem and ORAM, and highlight their differences.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Khatamifard:2017:TTA, author = "S. Karen Khatamifard and Longfei Wang and Weize Yu and Sel{\c{c}}uk K{\"o}se and Ulya R. 
Karpuzcu", title = "{ThermoGater}: Thermally-Aware On-Chip Voltage Regulation", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "120--132", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080250", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Tailoring the operating voltage to fine-grain temporal changes in the power and performance needs of the workload can effectively enhance power efficiency. Therefore, power-limited computing platforms of today widely deploy integrated (i.e., on-chip) voltage regulation which enables fast fine-grain voltage control. Voltage regulators convert and distribute power from an external energy source to the processor. Unfortunately, power conversion loss is inevitable and projected integrated regulator designs are unlikely to eliminate this loss even asymptotically. Reconfigurable power delivery by selective shut-down, i.e., gating, of distributed on-chip regulators in response to spatio-temporal changes in power demand can sustain operation at the minimum conversion loss. However, even the minimum conversion loss is sizable, and as conversion loss gets dissipated as heat, on-chip regulators can easily cause thermal emergencies due to their small footprint. Although reconfigurable distributed on-chip power delivery is emerging as a new design paradigm to enforce sustained operation at minimum possible power conversion loss, thermal implications have been overlooked at the architectural level. This paper hence provides a thermal characterization. We introduce ThermoGater, an architectural governor for a collection of practical, thermally-aware regulator gating policies to mitigate (if not prevent) regulator-induced thermal emergencies, which also consider potential implications for voltage noise. 
Practical ThermoGater policies can not only sustain minimum power conversion loss throughout execution effectively, but also keep the maximum temperature (thermal gradient) across chip within 0.6${}^\circ $C (0.3${}^\circ $C) on average in comparison to thermally-optimal oracular regulator gating, while the maximum voltage noise stays within 1.0\% of the best case voltage noise profile.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yang:2017:PIP, author = "Hailong Yang and Quan Chen and Moeiz Riaz and Zhongzhi Luan and Lingjia Tang and Jason Mars", title = "{PowerChief}: Intelligent Power Allocation for Multi-Stage Applications to Improve Responsiveness on Power Constrained {CMP}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "133--146", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080224", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern user facing applications consist of multiple processing stages with a number of service instances in each stage. The latency profile of these multi-stage applications is intrinsically variable, making it challenging to provide satisfactory responsiveness. Given a limited power budget, improving the end-to-end latency requires intelligently boosting the bottleneck service across stages using multiple boosting techniques. However, prior work fails to acknowledge the multi-stage nature of user-facing applications and perform poorly in improving responsiveness on power constrained CMP, as they are unable to accurately identify bottleneck service and apply the boosting techniques adaptively. 
In this paper, we present PowerChief, a runtime framework that (1) provides joint design of service and query to monitor the latency statistics across service stages and accurately identifies the bottleneck service during runtime; (2) adaptively chooses the boosting technique to accelerate the bottleneck service with improved responsiveness; (3) dynamically reallocates the constrained power budget across service stages to accommodate the chosen boosting technique. Evaluated with real world multi-stage applications, PowerChief improves the average latency by 20.3x and 32.4x (99\% tail latency by 13.3x and 19.4x) for Sirius and Natural Language Processing applications respectively compared to stage-agnostic power allocation. In addition, for the given QoS target, PowerChief reduces the power consumption of Sirius and Web Search applications by 23\% and 33\% respectively over prior work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ravi:2017:CCH, author = "Gokul Subramanian Ravi and Mikko H. Lipasti", title = "{CHARSTAR: Clock Hierarchy Aware Resource Scaling in Tiled ARchitectures}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "147--160", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080212", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "High-performance architectures are over-provisioned with resources to extract the maximum achievable performance out of applications. Two sources of avoidable power dissipation are the leakage power from underutilized resources, along with clock power from the clock hierarchy that feeds these resources. 
Most reconfiguration mechanisms either focus solely on power gating execution resources alone or in addition, simply turn off the immediate clock tree segment which supplied the clock to those resources. These proposals neither attempt to gate further up the clock hierarchy nor do they involve the clock hierarchy in influencing the reconfiguration decisions. The primary contribution of CHARSTAR is optimizing reconfiguration mechanisms to become clock hierarchy aware. Resource gating decisions are cognizant of the power consumed by each node in the clock hierarchy and additionally, entire branches of the clock tree are greedily shut down whenever possible. The CHARSTAR design is further optimized for balanced spatio-temporal reconfiguration and also enables efficient joint control of resource and frequency scaling. The proposal is implemented by leveraging the inherent advantages of spatial architectures, utilizing a control mechanism driven by a lightweight offline trained neural predictor. CHARSTAR, when deployed on the CRIB tiled microarchitecture, improves processor energy efficiency by 20-25\%, with efficiency improvements of roughly 2x in comparison to a naive power gating mechanism. Alternatively, it improves performance by 10-20\% under varying power and energy constraints.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sinclair:2017:CRS, author = "Matthew D. Sinclair and Johnathan Alsop and Sarita V. 
Adve", title = "Chasing Away {RAts}: Semantics and Evaluation for Relaxed Atomics on Heterogeneous Systems", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "161--174", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080206", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "An unambiguous and easy-to-understand memory consistency model is crucial for ensuring correct synchronization and guiding future design of heterogeneous systems. In a widely adopted approach, the memory model guarantees sequential consistency (SC) as long as programmers obey certain rules. The popular data-race-free-0 (DRF0) model exemplifies this SC-centric approach by requiring programmers to avoid data races. Recent industry models, however, have extended such SC-centric models to incorporate relaxed atomics. These extensions can improve performance, but are difficult to specify formally and use correctly. This work addresses the impact of relaxed atomics on consistency models for heterogeneous systems in two ways. First, we introduce a new model, Data-Race-Free-Relaxed (DRFrlx), that extends DRF0 to provide SC-centric semantics for the common use cases of relaxed atomics. Second, we evaluate the performance of relaxed atomics in CPU-GPU systems for these use cases. 
We find mixed results --- for most cases, relaxed atomics provide only a small benefit in execution time, but for some cases, they help significantly (e.g., up to 51\% for DRFrlx over DRF0).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shin:2017:HLL, author = "Seunghee Shin and James Tuck and Yan Solihin", title = "Hiding the Long Latency of Persist Barriers Using Speculative Execution", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "175--186", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080240", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Byte-addressable non-volatile memory technology is emerging as an alternative for DRAM for main memory. This new Non-Volatile Main Memory (NVMM) allows programmers to store important data in data structures in memory instead of serializing it to the file system, thereby providing a substantial performance boost. However, modern systems reorder memory operations and utilize volatile caches for better performance, making it difficult to ensure a consistent state in NVMM. Intel recently announced a new set of persistence instructions, clflushopt, clwb, and pcommit. These new instructions make it possible to implement fail-safe code on NVMM, but few workloads have been written or characterized using these new instructions. In this work, we describe how these instructions work and how they can be used to implement write-ahead logging based transactions. We implement several common data structures and kernels and evaluate the performance overhead incurred over traditional non-persistent implementations. 
In particular, we find that persistence instructions occur in clusters along with expensive fence operations, they have long latency, and they add a significant execution time overhead, on average by 20.3\% over code with logging but without fence instructions to order persists. To deal with this overhead and alleviate the performance bottleneck, we propose to speculate past long latency persistency operations using checkpoint-based processing. Our speculative persistence architecture reduces the execution time overheads to only 3.6\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ros:2017:NSL, author = "Alberto Ros and Trevor E. Carlson and Mehdi Alipour and Stefanos Kaxiras", title = "Non-Speculative Load-Load Reordering in {TSO}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "187--200", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080220", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In Total Store Order memory consistency (TSO), loads can be speculatively reordered to improve performance. If a load-load reordering is seen by other cores, speculative loads must be squashed and re-executed. In architectures with an unordered interconnection network and directory coherence, this has been the established view for decades. We show, for the first time, that it is not necessary to squash and re-execute speculatively reordered loads in TSO when their reordering is seen. Instead, the reordering can be hidden from other cores by the coherence protocol. The implication is that we can irrevocably bind speculative loads. 
This allows us to commit reordered loads out-of-order without having to wait (for the loads to become non-speculative) or without having to checkpoint committed state (and rollback if needed), just to ensure correctness in the rare case of some core seeing the reordering. We show that by exposing a reordering to the coherence layer and by appropriately modifying a typical directory protocol we can successfully hide load-load reordering without perceptible performance cost and without deadlock. Our solution is cost-effective and increases the performance of out-of-order commit by a sizable margin, compared to the base case where memory operations are not allowed to commit if the consistency model could be violated.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Lee:2017:MVN, author = "Doowon Lee and Valeria Bertacco", title = "{MTraceCheck}: Validating Non-Deterministic Behavior of Memory Consistency Models in Post-Silicon Validation", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "201--213", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080235", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This work presents a minimally-intrusive, high-performance, post-silicon validation framework for validating memory consistency in multi-core systems. Our framework generates constrained-random tests that are instrumented with observability-enhancing code for memory consistency verification. For each test, we generate a set of compact signatures reflecting the memory-ordering patterns observed over many executions of the test, with each of the signatures corresponding to a unique memory-ordering pattern. 
We then leverage an efficient and novel analysis to quickly determine if the observed execution patterns represented by each unique signature abide by the memory consistency model. Our analysis derives its efficiency by exploiting the structural similarities among the patterns observed. We evaluated our framework, MTraceCheck, on two platforms: an x86-based desktop and an ARM-based SoC platform, both running multi-threaded test programs in a bare-metal environment. We show that MTraceCheck reduces the perturbation introduced by the memory-ordering monitoring activity by 93\% on average, compared to a baseline register flushing approach that saves the register's state after each load operation. We also reduce the computation requirements of our consistency checking analysis by 81\% on average, compared to a conventional topological sorting solution. We finally demonstrate the effectiveness of MTraceCheck on buggy designs, by evaluating multiple case studies where it successfully exposes subtle bugs in a full-system simulation environment.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Zheng:2017:RMA, author = "Ruohuang Zheng and Michael C. Huang", title = "Redundant Memory Array Architecture for Efficient Selective Protection", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "214--227", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080213", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Memory hardware errors may result from transient particle-induced faults as well as device defects due to aging. These errors are an important threat to computer system reliability as VLSI technologies continue to scale. 
Managing memory hardware errors is a critical component in developing an overall system dependability strategy. Memory error detection and correction are supported in a range of available hardware mechanisms. However, memory protections (particularly the more advanced ones) come at substantial costs in performance and energy usage. Moreover, the protection mechanisms are often a fixed, system-wide choice and can not easily adapt to different protection demand of different applications or memory regions. In this paper, we present a new RAIM (redundant array of independent memory) design that compared to the state-of-the-art implementation can easily provide high protection capability and the ability to selectively protect a subset of the memory. A straightforward implementation of the design can incur a substantial memory traffic overhead. We propose a few practical optimizations to mitigate this overhead. With these optimizations the proposed RAIM design offers significant advantages over existing RAIM design at lower or comparable costs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Hicks:2017:CAS, author = "Matthew Hicks", title = "{Clank}: Architectural Support for Intermittent Computation", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "228--240", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080238", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The processors that drive embedded systems are getting smaller; meanwhile, the batteries used to provide power to those systems have stagnated. 
If we are to realize the dream of ubiquitous computing promised by the Internet of Things, processors must shed large, heavy, expensive, and high maintenance batteries and, instead, harvest energy from their environment. One challenge with this transition is that harvested energy is insufficient for continuous operation. Unfortunately, existing programs fail miserably when executed intermittently. This paper presents Clank: lightweight architectural support for correct and efficient execution of long-running applications on harvested energy---without programmer intervention or extreme hardware modifications. Clank is a set of hardware buffers and memory-access monitors that dynamically maintain idempotency. Essentially, Clank dynamically decomposes program execution into a stream of restartable sub-executions connected via lightweight checkpoints. To validate Clank's ability to correctly stretch program execution across frequent, random power cycles, and to explore the associated hardware and software overheads, we implement Clank in Verilog, formally verify it, and then add it to an ARM Cortex M0+ processor which we use to run a set of 23 embedded systems benchmarks. Experiments show run-time overheads as low as 2.5\%, with run-time overheads of 6\% for a version of Clank that adds 1.7\% hardware. 
Clank minimizes checkpoints so much that re-execution time becomes the dominant contributor to run-time overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kaliorakis:2017:MED, author = "Manolis Kaliorakis and Dimitris Gizopoulos and Ramon Canal and Antonio Gonzalez", title = "{MeRLiN}: Exploiting Dynamic Instruction Behavior for Fast and Accurate Microarchitecture Level Reliability Assessment", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "241--254", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080225", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Early reliability assessment of hardware structures using microarchitecture level simulators can effectively guide major error protection decisions in microprocessor design. Statistical fault injection on microarchitectural structures modeled in performance simulators is an accurate method to measure their Architectural Vulnerability Factor (AVF) but requires excessively long campaigns to obtain high statistical significance. We propose MeRLiN, a methodology to boost microarchitecture level injection-based reliability assessment by several orders of magnitude and keep the accuracy of the assessment unaffected even for large injection campaigns with very high statistical significance. The core of MeRLiN is the grouping of faults of an initial list in equivalent classes. All faults in the same group target equivalent vulnerable intervals of program execution ending up to the same static instruction that reads the faulty entries. 
Faults in the same group occur in different times and entries of a structure and it is extremely likely that they all have the same effect in program execution; thus, fault injection is performed only on a few representatives from each group. We evaluate MeRLiN for different sizes of the physical register file, the store queue and the first level data cache of a contemporary microarchitecture running MiBench and SPEC CPU2006 benchmarks. For all our experiments, MeRLiN is from 2 to 3 orders of magnitude faster than an extremely high statistical significant injection campaign, reporting the same reliability measurements with negligible loss of accuracy. Finally, we theoretically analyze MeRLiN's statistical behavior to further justify its accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Patel:2017:RPR, author = "Minesh Patel and Jeremie S. Kim and Onur Mutlu", title = "The Reach Profiler {(REAPER)}: Enabling the Mitigation of {DRAM} Retention Failures via Profiling at Aggressive Conditions", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "255--268", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080242", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Modern DRAM-based systems suffer from significant energy and latency penalties due to conservative DRAM refresh standards. Volatile DRAM cells can retain information across a wide distribution of times ranging from milliseconds to many minutes, but each cell is currently refreshed every 64ms to account for the extreme tail end of the retention time distribution, leading to a high refresh overhead. Due to poor DRAM technology scaling, this problem is expected to get worse in future device generations. 
Hence, the current approach of refreshing all cells with the worst-case refresh rate must be replaced with a more intelligent design. Many prior works propose reducing the refresh overhead by extending the default refresh interval to a higher value, which we refer to as the target refresh interval, across parts or all of a DRAM chip. These proposals handle the small set of failing cells that cannot retain data throughout the entire extended refresh interval via retention failure mitigation mechanisms (e.g., error correcting codes or bit-repair mechanisms). This set of failing cells is discovered via retention failure profiling, which is currently a brute-force process that writes a set of known data to DRAM, disables refresh and waits for the duration of the target refresh interval, and then checks for retention failures across the DRAM chip. We show that this brute-force approach is too slow and is detrimental to system execution, especially with frequent online profiling. This paper presents reach profiling, a new methodology for retention failure profiling based on the key observation that an overwhelming majority of failing DRAM cells at a target refresh interval fail more reliably at both longer refresh intervals and higher temperatures. Using 368 state-of-the-art LPDDR4 DRAM chips from three major vendors, we conduct a thorough experimental characterization of the complex set of tradeoffs inherent in the profiling process. We identify three key metrics to guide design choices for retention failure profiling and mitigation mechanisms: coverage, false positive rate, and runtime. We propose reach profiling, a new retention failure profiling mechanism whose key idea is to profile failing cells at a longer refresh interval and/or higher temperature relative to the target conditions in order to maximize failure coverage while minimizing the false positive rate and profiling runtime. 
We thoroughly explore the tradeoffs associated with reach profiling and show that there is significant room for improvement in DRAM retention failure profiling beyond the brute-force approach. We show with experimental data that on average, by profiling at 250ms above the target refresh interval, our first implementation of reach profiling (called REAPER) can attain greater than 99\% coverage of failing DRAM cells with less than a 50\% false positive rate while running 2.5x faster than the brute-force approach. In addition, our end-to-end evaluations show that REAPER enables significant system performance improvement and DRAM power reduction, outperforming the brute-force approach and enabling high-performance operation at longer refresh intervals that were previously unreasonable to employ due to the high associated profiling overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2017:QSS, author = "Zhenning Wang and Jun Yang and Rami Melhem and Bruce Childers and Youtao Zhang and Minyi Guo", title = "Quality of Service Support for Fine-Grained Sharing on {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "269--281", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080203", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "GPUs have been widely adopted in data centers to provide acceleration services to many applications. Sharing a GPU is increasingly important for better processing throughput and energy efficiency. However, quality of service (QoS) among concurrent applications is minimally supported. Previous efforts are too coarse-grained and not scalable with increasing QoS requirements. We propose QoS mechanisms for a fine-grained form of GPU sharing. 
Our QoS support can provide control over the progress of kernels on a per cycle basis and the amount of thread-level parallelism of each kernel. Due to accurate resource management, our QoS support has significantly better scalability compared with previous best efforts. Evaluations show that, when the GPU is shared by three kernels, two of which have QoS goals, the proposed techniques achieve QoS goals 43.8\% more often than previous techniques and have 20.5\% higher throughput.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Chen:2017:AGH, author = "Sui Chen and Lu Peng and Samuel Irving", title = "Accelerating {GPU} Hardware Transactional Memory with Snapshot Isolation", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "282--294", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080204", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Snapshot Isolation (SI) is an established model in the database community, which permits write-read conflicts to pass and aborts transactions only on write-write conflicts. With the Write Skew anomaly correctly eliminated, SI can reduce the occurrence of aborts, save the work done by transactions, and greatly benefit long transactions involving complex data structures. GPUs are evolving towards a general-purpose computing device with growing support for irregular workloads, including transactional memory. The usage of snapshot isolation on transactional memory has proven to be greatly beneficial for performance. 
In this paper, we propose a multi-versioned memory subsystem for hardware-based transactional memory on the GPU, with a method for eliminating the Write Skew anomaly on the fly, and finally incorporate Snapshot Isolation with this system. The results show that snapshot isolation can effectively boost the performance of dynamically sized data structures such as linked lists, binary trees and red-black trees, sometimes by as much as 4.5x, which results in improved overall performance of benchmarks utilizing these data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Wang:2017:DAC, author = "Kai Wang and Calvin Lin", title = "Decoupled Affine Computation for {SIMT GPUs}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "295--306", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080205", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper introduces a method of decoupling affine computations---a class of expressions that produces extremely regular values across SIMT threads---from the main execution stream, so that the affine computations can be performed with greater efficiency and with greater independence from the main execution stream. This decoupling has two benefits: (1) For compute-bound programs, it significantly reduces the dynamic warp instruction count; (2) for memory-bound workloads, it significantly reduces memory latency, since it acts as a non-speculative prefetcher for the data specified by the many memory address calculations that are affine computations. We evaluate our solution, known as Decoupled Affine Computation (DAC), using GPGPU-sim and a set of 29 GPGPU programs. 
We find that on average, DAC improves performance by 40\% and reduces energy consumption by 20\%. For the 11 compute-bound benchmarks, DAC improves performance by 34\%, compared with 11\% for the previous state-of-the-art. For the 18 memory-bound programs, DAC improves performance by an average of 44\%, compared with 16\% for state-of-the-art GPU prefetcher.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Koo:2017:APA, author = "Gunjae Koo and Yunho Oh and Won Woo Ro and Murali Annavaram", title = "Access Pattern-Aware Cache Management for Improving Data Utilization in {GPU}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "307--319", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080239", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Long latency of memory operation is a prominent performance bottleneck in graphics processing units (GPUs). The small data cache that must be shared across dozens of warps (a collection of threads) creates significant cache contention and premature data eviction. Prior works have recognized this problem and proposed warp throttling which reduces the number of active warps contending for cache space. In this paper we discover that individual load instructions in a warp exhibit four different types of data locality behavior: (1) data brought by a warp load instruction is used only once, which is classified as streaming data (2) data brought by a warp load is reused multiple times within the same warp, called intra-warp locality (3) data brought by a warp is reused multiple times but across different warps, called inter-warp locality (4) and some data exhibit both a mix of intra- and inter-warp locality. 
Furthermore, each load instruction exhibits consistently the same locality type across all warps within a GPU kernel. Based on this discovery we argue that cache management must be done using per-load locality type information, rather than applying warp-wide cache management policies. We propose Access Pattern-aware Cache Management (APCM), which dynamically detects the locality type of each load instruction by monitoring the accesses from one exemplary warp. APCM then uses the detected locality type to selectively apply cache bypassing and cache pinning of data based on load locality characterization. Using an extensive set of simulations we show that APCM improves performance of GPUs by 34\% for cache sensitive applications while saving 27\% of energy consumption over baseline GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Arunkumar:2017:MGM, author = "Akhil Arunkumar and Evgeny Bolotin and Benjamin Cho and Ugljesa Milic and Eiman Ebrahimi and Oreste Villa and Aamer Jaleel and Carole-Jean Wu and David Nellans", title = "{MCM-GPU}: Multi-Chip-Module {GPUs} for Continued Performance Scalability", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "320--332", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080231", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Historically, improvements in GPU-based high performance computing have been tightly coupled to transistor scaling. As Moore's law slows down, and the number of transistors per die no longer grows at historical rates, the performance curve of single monolithic GPUs will ultimately plateau. However, the need for higher performing GPUs continues to exist in many domains. 
To address this need, in this paper we demonstrate that package-level integration of multiple GPU modules to build larger logical GPUs can enable continuous performance scaling beyond Moore's law. Specifically, we propose partitioning GPUs into easily manufacturable basic GPU Modules (GPMs), and integrating them on package using high bandwidth and power efficient signaling technologies. We lay out the details and evaluate the feasibility of a basic Multi-Chip-Module GPU (MCM-GPU) design. We then propose three architectural optimizations that significantly improve GPM data locality and minimize the sensitivity on inter-GPM bandwidth. Our evaluation shows that the optimized MCM-GPU achieves 22.8\% speedup and 5x inter-GPM bandwidth reduction when compared to the basic MCM-GPU architecture. Most importantly, the optimized MCM-GPU design is 45.5\% faster than the largest implementable monolithic GPU, and performs within 10\% of a hypothetical (and unbuildable) monolithic GPU. Lastly we show that our optimized MCM-GPU is 26.8\% faster than an equally equipped Multi-GPU system with the same total number of SMs and DRAM bandwidth.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nazari:2017:EEB, author = "Alireza Nazari and Nader Sehatbakhsh and Monjur Alam and Alenka Zajic and Milos Prvulovic", title = "{EDDIE}: {EM}-Based Detection of Deviations in Program Execution", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "333--346", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080223", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper describes EM-Based Detection of Deviations in Program Execution (EDDIE), a new method for detecting anomalies in program execution, such as 
malware and other code injections, without introducing any overheads, adding any hardware support, changing any software, or using any resources on the monitored system itself. Monitoring with EDDIE involves receiving electromagnetic (EM) emanations that are emitted as a side effect of execution on the monitored system, and it relies on spikes in the EM spectrum that are produced as a result of periodic (e.g. loop) activity in the monitored execution. During training, EDDIE characterizes normal execution behavior in terms of peaks in the EM spectrum that are observed at various points in the program execution, but it does not need any characterization of the malware or other code that might later be injected. During monitoring, EDDIE identifies peaks in the observed EM spectrum, and compares these peaks to those learned during training. Since EDDIE requires no resources on the monitored machine and no changes to the monitored software, it is especially well suited for security monitoring of embedded and IoT devices. 
We evaluate EDDIE on a real IoT system and in a cycle-accurate simulator, and find that even relatively brief injected bursts of activity (a few milliseconds) are detected by EDDIE with high accuracy, and that it also accurately detects when even a few instructions are injected into an existing loop within the application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yan:2017:SHA, author = "Mengjia Yan and Bhargava Gopireddy and Thomas Shull and Josep Torrellas", title = "Secure Hierarchy-Aware Cache Replacement Policy {(SHARP)}: Defending Against Cache-Based Side Channel Attacks", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "347--360", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080222", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "In cache-based side channel attacks, a spy that shares a cache with a victim probes cache locations to extract information on the victim's access patterns. For example, in evict+reload, the spy repeatedly evicts and then reloads a probe address, checking if the victim has accessed the address in between the two operations. While there are many proposals to combat these cache attacks, they all have limitations: they either hurt performance, require programmer intervention, or can only defend against some types of attacks. This paper makes the following observation for an environment with an inclusive cache hierarchy: when the spy evicts the probe address from the shared cache, the address will also be evicted from the private cache of the victim process, creating an inclusion victim. 
Consequently, to disable cache attacks, this paper proposes to alter the line replacement algorithm of the shared cache, to prevent a process from creating inclusion victims in the caches of cores running other processes. By enforcing this rule, the spy cannot evict the probe address from the shared cache and, hence, cannot glimpse any information on the victim's access patterns. We call our proposal SHARP (Secure Hierarchy-Aware cache Replacement Policy). SHARP efficiently defends against all existing cross-core shared-cache attacks, needs only minimal hardware modifications, and requires no code modifications. We implement SHARP in a cycle-level full-system simulator. We show that it protects against real-world attacks, and that it introduces negligible average performance degradation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Deng:2017:LLH, author = "Zhaoxia Deng and Ariel Feldman and Stuart A. Kurtz and Frederic T. Chong", title = "Lemonade from Lemons: Harnessing Device Wearout to Create Limited-Use Security Architectures", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "361--374", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080226", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Most architectures are designed to mitigate the usually undesirable phenomenon of device wearout. We take a contrarian view and harness this phenomenon to create hardware security mechanisms that resist attacks by statistically enforcing an upper bound on hardware uses, and consequently attacks. For example, let us assume that a user may log into a smartphone a maximum of 50 times a day for 5 years, resulting in approximately 91,250 legitimate uses. 
If we assume at least 8-character passwords and we require login (and retrieval of the storage decryption key) to traverse hardware that wears out in 91,250 uses, then an adversary has a negligible chance of successful brute-force attack before the hardware wears out, even assuming real-world password cracking by professionals. M-way replication of our hardware and periodic re-encryption of storage can increase the daily usage bound by a factor of M. The key challenge is to achieve practical statistical bounds on both minimum and maximum uses for an architecture, given that individual devices can vary widely in wearout characteristics. We introduce techniques for architecturally controlling these bounds and perform a design space exploration for three use cases: a limited-use connection, a limited-use targeting system and one-time pads. These techniques include decision trees, parallel structures, Shamir's secret-sharing mechanism, Reed--Solomon codes, and module replication. We explore the cost in area, energy and latency of using these techniques to achieve system-level usage targets given device-level wearout distributions. With redundant encoding, for example, we can improve exponential sensitivity to device lifetime variation to linear sensitivity, reducing the total number of NEMS devices by 4 orders of magnitude to about 0.8 million for limited-use connections (compared with 4 billion if without redundant encoding).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Altaf:2017:LHL, author = "Muhammad Shoaib Bin Altaf and David A. 
Wood", title = "{LogCA}: a High-Level Performance Model for Hardware Accelerators", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "375--388", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080216", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the end of Dennard scaling, architects have increasingly turned to special-purpose hardware accelerators to improve the performance and energy efficiency for some applications. Unfortunately, accelerators don't always live up to their expectations and may under-perform in some situations. Understanding the factors which affect the performance of an accelerator is crucial for both architects and programmers early in the design stage. Detailed models can be highly accurate, but often require low-level details which are not available until late in the design cycle. In contrast, simple analytical models can provide useful insights by abstracting away low-level system details. In this paper, we propose LogCA---a high-level performance model for hardware accelerators. LogCA helps both programmers and architects identify performance bounds and design bottlenecks early in the design cycle, and provide insight into which optimizations may alleviate these bottlenecks. We validate our model across a variety of kernels, ranging from sub-linear to super-linear complexities on both on-chip and off-chip accelerators. We also describe the utility of LogCA using two retrospective case studies. First, we discuss the evolution of interface design in SUN/Oracle's encryption accelerators. Second, we discuss the evolution of memory interface design in three different GPU architectures. In both cases, we show that the adopted design optimizations for these machines are similar to LogCA's suggested optimizations. 
We argue that architects and programmers can use insights from these retrospective studies for improving future designs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Prabhakar:2017:PRA, author = "Raghu Prabhakar and Yaqi Zhang and David Koeplinger and Matt Feldman and Tian Zhao and Stefan Hadjis and Ardavan Pedram and Christos Kozyrakis and Kunle Olukotun", title = "{Plasticine}: a Reconfigurable Architecture For Parallel Patterns", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "389--402", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080256", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Reconfigurable architectures have gained popularity in recent years as they allow the design of energy-efficient accelerators. Fine-grain fabrics (e.g. FPGAs) have traditionally suffered from performance and power inefficiencies due to bit-level reconfigurable abstractions. Both fine-grain and coarse-grain architectures (e.g. CGRAs) traditionally require low level programming and suffer from long compilation times. We address both challenges with Plasticine, a new spatially reconfigurable architecture designed to efficiently execute applications composed of parallel patterns. Parallel patterns have emerged from recent research on parallel programming as powerful, high-level abstractions that can elegantly capture data locality, memory access patterns, and parallelism across a wide range of dense and sparse applications. We motivate Plasticine by first observing key application characteristics captured by parallel patterns that are amenable to hardware acceleration, such as hierarchical parallelism, data locality, memory access patterns, and control flow. 
Based on these observations, we architect Plasticine as a collection of Pattern Compute Units and Pattern Memory Units. Pattern Compute Units are multi-stage pipelines of reconfigurable SIMD functional units that can efficiently execute nested patterns. Data locality is exploited in Pattern Memory Units using banked scratchpad memories and configurable address decoders. Multiple on-chip address generators and scatter-gather engines make efficient use of DRAM bandwidth by supporting a large number of outstanding memory requests, memory coalescing, and burst mode for dense accesses. Plasticine has an area footprint of 113 mm$^2$ in a 28nm process, and consumes a maximum power of 49 W at a 1 GHz clock. Using a cycle-accurate simulator, we demonstrate that Plasticine provides an improvement of up to 76.9x in performance-per-Watt over a conventional FPGA over a wide range of dense and sparse applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kung:2017:PHA, author = "Jaeha Kung and Yun Long and Duckhwan Kim and Saibal Mukhopadhyay", title = "A Programmable Hardware Accelerator for Simulating Dynamical Systems", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "403--415", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080252", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The fast and energy-efficient simulation of dynamical systems defined by coupled ordinary/partial differential equations has emerged as an important problem. The accelerated simulation of coupled ODE/PDE is critical for analysis of physical systems as well as computing with dynamical systems. This paper presents a fast and programmable accelerator for simulating dynamical systems. 
The computing model of the proposed platform is based on multilayer cellular nonlinear network (CeNN) augmented with nonlinear function evaluation engines. The platform can be programmed to accelerate wide classes of ODEs/PDEs by modulating the connectivity within the multilayer CeNN engine. An innovative hardware architecture including data reuse, memory hierarchy, and near-memory processing is designed to accelerate the augmented multilayer CeNN. A dataflow model is presented which is supported by optimized memory hierarchy for efficient function evaluation. The proposed solver is designed and synthesized in 15nm technology for the hardware analysis. The performance is evaluated and compared to GPU nodes when solving wide classes of differential equations and the power consumption is analyzed to show orders of magnitude improvement in energy efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Nowatzki:2017:SDA, author = "Tony Nowatzki and Vinay Gangadhar and Newsha Ardalani and Karthikeyan Sankaralingam", title = "Stream-Dataflow Acceleration", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "416--429", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080255", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Demand for low-power data processing hardware continues to rise inexorably. Existing programmable and ``general purpose'' solutions (e.g., SIMD, GPGPUs) are insufficient, as evidenced by the order-of-magnitude improvements and industry adoption of application and domain-specific accelerators in important areas like machine learning, computer vision and big data. 
The stark tradeoffs between efficiency and generality at these two extremes poses a difficult question: how could domain-specific hardware efficiency be achieved without domain-specific hardware solutions? In this work, we rely on the insight that ``acceleratable'' algorithms have broad common properties: high computational intensity with long phases, simple control patterns and dependences, and simple streaming memory access and reuse patterns. We define a general architecture (a hardware-software interface) which can more efficiently expresses program with these properties called stream-dataflow. The dataflow component of this architecture enables high concurrency, and the stream component enables communication and coordination at very-low power and area overhead. This paper explores the hardware and software implications, describes its detailed microarchitecture, and evaluates an implementation. Compared to a state-of-the-art domain specific accelerator (DianNao), and fixed-function accelerators for MachSuite, Softbrain can match their performance with only 2x power overhead on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yan:2017:HTC, author = "Zi Yan and J{\'a}n Vesel{\'y} and Guilherme Cox and Abhishek Bhattacharjee", title = "Hardware Translation Coherence for Virtualized Systems", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "430--443", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080211", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "To improve system performance, operating systems (OSes) often undertake activities that require modification of virtual-to-physical address translations. 
For example, the OS may migrate data between physical pages to manage heterogeneous memory devices. We refer to such activities as page remappings. Unfortunately, page remappings are expensive. We show that a big part of this cost arises from address translation coherence, particularly on systems employing virtualization. In response, we propose hardware translation invalidation and coherence or HATRIC, a readily implementable hardware mechanism to piggyback translation coherence atop existing cache coherence protocols. We perform detailed studies using KVM-based virtualization, showing that HATRIC achieves up to 30\% performance and 10\% energy benefits, for per-CPU area overheads of 0.2\%. We also quantify HATRIC's benefits on systems running Xen and find up to 33\% performance improvements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Park:2017:HTC, author = "Chang Hyun Park and Taekyung Heo and Jungi Jeong and Jaehyuk Huh", title = "Hybrid {TLB} Coalescing: Improving {TLB} Translation Coverage under Diverse Fragmented Memory Allocations", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "444--456", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080217", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "To mitigate excessive TLB misses in large memory applications, techniques such as large pages, variable length segments, and HW coalescing, increase the coverage of limited hardware translation entries by exploiting the contiguous memory allocation. However, recent studies show that in non-uniform memory systems, using large pages often leads to performance degradation, or allocating large chunks of memory becomes more difficult due to memory fragmentation. 
Although each of the prior techniques favors its own best chunk size, diverse contiguity of memory allocation in real systems cannot always provide the optimal chunk of each technique. Under such fragmented and diverse memory allocations, this paper proposes a novel HW-SW hybrid translation architecture, which can adapt to different memory mappings efficiently. In the proposed hybrid coalescing technique, the operating system encodes memory contiguity information in a subset of page table entries, called anchor entries. During address translation through TLBs, an anchor entry provides translation for contiguous pages following the anchor entry. As a smaller number of anchor entries can cover a large portion of virtual address space, the efficiency of TLB can be significantly improved. The most important benefit of hybrid coalescing is its ability to change the coverage of the anchor entry dynamically, reflecting the current allocation contiguity status. By using the contiguity information directly set by the operating system, the technique can provide scalable translation coverage improvements with minor hardware changes, while allowing the flexibility of memory allocation. 
Our experimental results show that across diverse allocation scenarios with different distributions of contiguous memory chunks, the proposed scheme can effectively reap the potential translation coverage improvement from the existing contiguity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Alam:2017:DIY, author = "Hanna Alam and Tianhao Zhang and Mattan Erez and Yoav Etsion", title = "Do-It-Yourself Virtual Memory Translation", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "457--468", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080209", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "In this paper, we introduce the Do-It-Yourself virtual memory translation (DVMT) architecture as a flexible complement for current hardware-fixed translation flows. DVMT decouples the virtual-to-physical mapping process from the access permissions, giving applications freedom in choosing mapping schemes, while maintaining security within the operating system. Furthermore, DVMT is designed to support virtualized environments, as a means to collapse the costly, hardware-assisted two-dimensional translations. We describe the architecture in detail and demonstrate its effectiveness by evaluating several different DVMT schemes on a range of virtualized applications with a model based on measurements from a commercial system. 
We show that different DVMT configurations preserve the native performance, while achieving speedups of 1.2x to 2.0x in virtualized environments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ryoo:2017:RTD, author = "Jee Ho Ryoo and Nagendra Gulur and Shuang Song and Lizy K. John", title = "Rethinking {TLB} Designs in Virtualized Environments: a Very Large Part-of-Memory {TLB}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "469--480", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080210", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "With increasing deployment of virtual machines for cloud services and server applications, memory address translation overheads in virtualized environments have received great attention. In the radix-4 type of page tables used in x86 architectures, a TLB-miss necessitates up to 24 memory references for one guest to host translation. While dedicated page walk caches and such recent enhancements eliminate many of these memory references, our measurements on the Intel Skylake processors indicate that many programs in virtualized mode of execution still spend hundreds of cycles for translations that do not hit in the TLBs. This paper presents an innovative scheme to reduce the cost of address translations by using a very large Translation Lookaside Buffer that is part of memory, the POM-TLB. In the POM-TLB, only one access is required instead of up to 24 accesses required in commonly used 2D walks with radix-4 type of page tables. 
Even if many of the 24 accesses may hit in the page walk caches, the aggregated cost of the many hits plus the overhead of occasional misses from page walk caches still exceeds the cost of one access to the POM-TLB. Since the POM-TLB is part of the memory space, TLB entries (as opposed to multiple page table entries) can be cached in large L2 and L3 data caches, yielding significant benefits. Through detailed evaluation running SPEC, PARSEC and graph workloads, we demonstrate that the proposed POM-TLB improves performance by approximately 10\% on average. The improvement is more than 16\% for 5 of the benchmarks. It is further seen that a POM-TLB of 16MB size can eliminate nearly all TLB misses in 8-core systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kolli:2017:LLP, author = "Aasheesh Kolli and Vaibhav Gogte and Ali Saidi and Stephan Diestelhorst and Peter M. Chen and Satish Narayanasamy and Thomas F. Wenisch", title = "Language-Level Persistency", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "481--493", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080229", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The commercial release of byte-addressable persistent memories, such as Intel/Micron 3D XPoint memory, is imminent. Ongoing research has sought mechanisms to allow programmers to implement recoverable data structures in these new main memories. Ensuring recoverability requires programmer control of the order of persistent stores; recent work proposes persistency models as an extension to memory consistency to specify such ordering. Prior work has considered persistency models at the abstraction of the instruction set architecture.
Instead, we argue for extending the language-level memory model to provide guarantees on the order of persistent writes. We explore a taxonomy of guarantees a language-level persistency model might provide, considering both atomicity and ordering constraints on groups of persistent stores. Then, we propose and evaluate Acquire-Release Persistency (ARP), a language-level persistency model for C++11. We describe how to compile code written for ARP to a state-of-the-art ISA-level persistency model. We then consider enhancements to the ISA-level persistency model that can distinguish memory consistency constraints required for proper synchronization but unnecessary for correct recovery. With these optimizations, we show that ARP increases performance by up to 33.2\% (19.8\% avg.) over coding directly to the baseline ISA-level persistency model for a suite of persistent-write-intensive workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Choi:2017:SAS, author = "Jiho Choi and Thomas Shull and Maria J. Garzaran and Josep Torrellas", title = "{ShortCut}: Architectural Support for Fast Object Access in Scripting Languages", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "494--506", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080237", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The same flexibility that makes dynamic scripting languages appealing to programmers is also the primary cause of their low performance. To access objects of potentially different types, the compiler creates a dispatcher with a series of if statements, each performing a comparison to a type and a jump to a handler. This induces major overhead in instructions executed and branches mispredicted. 
This paper proposes architectural support to significantly improve the efficiency of accesses to objects. The idea is to modify the instruction that calls the dispatcher so that, under most conditions, it skips most of the branches and instructions needed to reach the correct handler, and sometimes even the execution of the handler itself. Our novel architecture, called ShortCut, performs two levels of optimization. Its Plain design transforms the call to the dispatcher into a call to the correct handler --- bypassing the whole dispatcher execution. Its Aggressive design transforms the call to the dispatcher into a simple load or store --- bypassing the execution of both dispatcher and handler. We implement the ShortCut software in the state-of-the-art Google V8 JIT compiler, and the ShortCut hardware in a simulator. We evaluate ShortCut with the Octane and SunSpider JavaScript application suites. Plain ShortCut reduces the average execution time of the applications by 30\% running under the baseline compiler, and by 11\% running under the maximum level of compiler optimization. Aggressive ShortCut performs only slightly better.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Gope:2017:ASS, author = "Dibakar Gope and David J. Schlais and Mikko H. Lipasti", title = "Architectural Support for Server-Side {PHP} Processing", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "507--520", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080234", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "PHP is the dominant server-side scripting language used to implement dynamic web content. 
Just-in-time compilation, as implemented in Facebook's state-of-the-art HipHopVM, helps mitigate the poor performance of PHP, but substantial overheads remain, especially for realistic, large-scale PHP applications. This paper analyzes such applications and shows that there is little opportunity for conventional microarchitectural enhancements. Furthermore, prior approaches for function-level hardware acceleration present many challenges due to the extremely flat distribution of execution time across a large number of functions in these complex applications. In-depth analysis reveals a more promising alternative: targeted acceleration of four fine-grained PHP activities: hash table accesses, heap management, string manipulation, and regular expression handling. We highlight a set of guiding principles and then propose and evaluate inexpensive hardware accelerators for these activities that accrue substantial performance and energy gains across dozens of functions. Our results reflect an average 17.93\% improvement in performance and 21.01\% reduction in energy while executing these complex PHP workloads on a state-of-the-art software and hardware platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kannan:2017:HDH, author = "Sudarsun Kannan and Ada Gavrilovska and Vishal Gupta and Karsten Schwan", title = "{HeteroOS}: {OS} Design for Heterogeneous Memory Management in Datacenter", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "521--534", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080245", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Heterogeneous memory management combined with server virtualization in 
datacenters is expected to increase the software and OS management complexity. State-of-the-art solutions rely exclusively on the hypervisor (VMM) for expensive page hotness tracking and migrations, limiting the benefits from heterogeneity. To address this, we design HeteroOS, a novel application-transparent OS-level solution for managing memory heterogeneity in virtualized system. The HeteroOS design first makes the guest-OSes heterogeneity-aware and then extracts rich OS-level information about applications' memory usage to place data in the `right' memory avoiding page migrations. When such pro-active placements are not possible, HeteroOS combines the power of the guest-OSes' information about applications with the VMM's hardware control to track for hotness and migrate only performance-critical pages. Finally, HeteroOS also designs an efficient heterogeneous memory sharing across multiple guest-VMs. Evaluation of HeteroOS with memory, storage, and network-intensive datacenter applications shows up to 2x performance improvement compared to the state-of-the-art VMM-exclusive approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Shen:2017:MCA, author = "Yongming Shen and Michael Ferdman and Peter Milder", title = "Maximizing {CNN} Accelerator Efficiency Through Resource Partitioning", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "535--547", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080221", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Convolutional neural networks (CNNs) are revolutionizing machine learning, but they present significant computational challenges.
Recently, many FPGA-based accelerators have been proposed to improve the performance and efficiency of CNNs. Current approaches construct a single processor that computes the CNN layers one at a time; the processor is optimized to maximize the throughput at which the collection of layers is computed. However, this approach leads to inefficient designs because the same processor structure is used to compute CNN layers of radically varying dimensions. We present a new CNN accelerator paradigm and an accompanying automated design methodology that partitions the available FPGA resources into multiple processors, each of which is tailored for a different subset of the CNN convolutional layers. Using the same FPGA resources as a single large processor, multiple smaller specialized processors increase computational efficiency and lead to a higher overall throughput. Our design methodology achieves 3.8x higher throughput than the state-of-the-art approach on evaluating the popular AlexNet CNN on a Xilinx Virtex-7 FPGA. For the more recent SqueezeNet and GoogLeNet, the speedups are 2.2x and 2.0x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Yu:2017:SCD, author = "Jiecao Yu and Andrew Lukefahr and David Palframan and Ganesh Dasika and Reetuparna Das and Scott Mahlke", title = "{Scalpel}: Customizing {DNN} Pruning to the Underlying Hardware Parallelism", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "548--560", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080215", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "As the size of Deep Neural Networks (DNNs) continues to grow to increase accuracy and solve more complex problems, their energy footprint also scales. 
Weight pruning reduces DNN model size and the computation by removing redundant weights. However, we implemented weight pruning for several popular networks on a variety of hardware platforms and observed surprising results. For many networks, the network sparsity caused by weight pruning will actually hurt the overall performance despite large reductions in the model size and required multiply-accumulate operations. Also, encoding the sparse format of pruned networks incurs additional storage space overhead. To overcome these challenges, we propose Scalpel that customizes DNN pruning to the underlying hardware by matching the pruned network structure to the data-parallel hardware organization. Scalpel consists of two techniques: SIMD-aware weight pruning and node pruning. For low-parallelism hardware (e.g., microcontroller), SIMD-aware weight pruning maintains weights in aligned fixed-size groups to fully utilize the SIMD units. For high-parallelism hardware (e.g., GPU), node pruning removes redundant nodes, not redundant weights, thereby reducing computation without sacrificing the dense matrix format. For hardware with moderate parallelism (e.g., desktop CPU), SIMD-aware weight pruning and node pruning are synergistically applied together. Across the microcontroller, CPU and GPU, Scalpel achieves mean speedups of 3.54x, 2.61x, and 1.25x while reducing the model sizes by 88\%, 82\%, and 53\%. 
In comparison, traditional weight pruning achieves mean speedups of 1.90x, 1.06x, 0.41x across the three platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Sa:2017:UOA, author = "Christopher {De Sa} and Matthew Feldman and Christopher R{\'e} and Kunle Olukotun", title = "Understanding and Optimizing Asynchronous Low-Precision Stochastic Gradient Descent", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "561--574", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080248", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Stochastic gradient descent (SGD) is one of the most popular numerical algorithms used in machine learning and other domains. Since this is likely to continue for the foreseeable future, it is important to study techniques that can make it run fast on parallel hardware. In this paper, we provide the first analysis of a technique called Buck-wild! that uses both asynchronous execution and low-precision computation. We introduce the DMGC model, the first conceptualization of the parameter space that exists when implementing low-precision SGD, and show that it provides a way to both classify these algorithms and model their performance. We leverage this insight to propose and analyze techniques to improve the speed of low-precision SGD. First, we propose software optimizations that can increase throughput on existing CPUs by up to 11X. Second, we propose architectural changes, including a new cache technique we call an obstinate cache, that increase throughput beyond the limits of current-generation hardware. 
We also implement and analyze low-precision SGD on the FPGA, which is a promising alternative to the CPU for future SGD systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Li:2017:API, author = "Zhaoshi Li and Leibo Liu and Yangdong Deng and Shouyi Yin and Yao Wang and Shaojun Wei", title = "Aggressive Pipelining of Irregular Applications on Reconfigurable Hardware", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "575--586", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080228", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "CPU-FPGA heterogeneous platforms offer a promising solution for high-performance and energy-efficient computing systems by providing specialized accelerators with post-silicon reconfigurability. To unleash the power of FPGA, however, the programmability gap has to be filled so that applications specified in high-level programming languages can be efficiently mapped and scheduled on FPGA. The above problem is even more challenging for irregular applications, in which the execution dependency can only be determined at run time. Thus over-serialized accelerators are generated from existing works that rely on compile time analysis to schedule the computation. In this work, we propose a comprehensive software-hardware co-design framework, which captures parallelism in irregular applications and aggressively schedules pipelined execution on reconfigurable platform. Based on an inherently parallel abstraction packaging parallelism for runtime schedule, our framework significantly differs from existing works that tend to schedule executions at compile time. 
An irregular application is formulated as a set of tasks with their dependencies specified as rules describing the conditions under which a subset of tasks can be executed concurrently. Then datapaths on FPGA will be generated by transforming applications in the formulation into task pipelines orchestrated by evaluating rules at runtime, which could exploit fine-grained pipeline parallelism as handcrafted accelerators do. An evaluation shows that this framework is able to produce datapath with its quality close to handcrafted designs. Experiments show that generated accelerators are dramatically more efficient than those created by current high-level synthesis tools. Meanwhile, accelerators generated for a set of irregular applications attain 0.5x--1.9x performance compared to equivalent software implementations we selected on a server-grade 10-core processor, with the memory subsystem remaining as the bottleneck.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Subramanian:2017:FEM, author = "Suvinay Subramanian and Mark C. Jeffrey and Maleen Abeydeera and Hyun Ryong Lee and Victor A. Ying and Joel Emer and Daniel Sanchez", title = "Fractal: an Execution Model for Fine-Grain Nested Speculative Parallelism", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "587--599", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080218", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Most systems that support speculative parallelization, like hardware transactional memory (HTM), do not support nested parallelism. This sacrifices substantial parallelism and precludes composing parallel algorithms.
And the few HTMs that do support nested parallelism focus on parallelizing at the coarsest (shallowest) levels, incurring large overheads that squander most of their potential. We present FRACTAL, a new execution model that supports unordered and timestamp-ordered nested parallelism. FRACTAL lets programmers seamlessly compose speculative parallel algorithms, and lets the architecture exploit parallelism at all levels. FRACTAL can parallelize a broader range of applications than prior speculative execution models. We design a FRACTAL implementation that extends the Swarm architecture and focuses on parallelizing at the finest (deepest) levels. Our approach sidesteps the issues of nested parallel HTMs and uncovers abundant fine-grain parallelism. As a result, FRACTAL outperforms prior speculative architectures by up to 88x at 256 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Subramaniyan:2017:PAP, author = "Arun Subramaniyan and Reetuparna Das", title = "Parallel Automata Processor", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "600--612", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080207", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Finite State Machines (FSM) are widely used computation models for many application domains. These embarrassingly sequential applications with irregular memory access patterns perform poorly on conventional von-Neumann architectures. The Micron Automata Processor (AP) is an in-situ memory-based computational architecture that accelerates non-deterministic finite automata (NFA) processing in hardware. However, each FSM on the AP is processed sequentially, limiting potential speedups. 
In this paper, we explore the FSM parallelization problem in the context of the AP. Extending classical parallelization techniques to NFAs executing on AP is non-trivial because of high state-transition tracking overheads and exponential computation complexity. We present the associated challenges and propose solutions that leverage both the unique properties of the NFAs (connected components, input symbol ranges, convergence, common parent states) and unique features in the AP (support for simultaneous transitions, low-overhead flow switching, state vector cache) to realize parallel NFA execution on the AP. We evaluate our techniques against several important benchmarks including NFAs used for network intrusion detection, malware detection, text processing, protein motif searching, DNA sequencing, and data analytics. Our proposed parallelization scheme demonstrates significant speedup (25.5x on average) compared to sequential execution on AP. Prior work has already shown that sequential execution on AP is at least an order of magnitude better than GPUs, multi-core processors and Xeon Phi accelerator.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Kateja:2017:VDB, author = "Rajat Kateja and Anirudh Badam and Sriram Govindan and Bikash Sharma and Greg Ganger", title = "{Viyojit}: Decoupling Battery and {DRAM} Capacities for Battery-Backed {DRAM}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "613--626", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080236", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Non-Volatile Memories (NVMs) can significantly improve the performance of data-intensive applications. 
A popular form of NVM is Battery-backed DRAM, which is available and in use today with DRAMs latency and without the endurance problems of emerging NVM technologies. Modern servers can be provisioned with up-to 4 TB of DRAM, and provisioning battery backup to write out such large memories is hard because of the large battery sizes and the added hardware and cooling costs. We present Viyojit, a system that exploits the skew in write working sets of applications to provision substantially smaller batteries while still ensuring durability for the entire DRAM capacity. Viyojit achieves this by bounding the number of dirty pages in DRAM based on the provisioned battery capacity and proactively writing out infrequently written pages to an SSD. Even for write-heavy workloads with less skew than we observe in analysis of real data center traces, Viyojit reduces the required battery capacity to 11\% of the original size, with a performance overhead of 7--25\%. Thus, Viyojit frees battery-backed DRAM from stunted growth of battery capacities and enables servers with terabytes of battery-backed DRAM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Young:2017:DCD, author = "Vinson Young and Prashant J. Nair and Moinuddin K. Qureshi", title = "{DICE}: Compressing {DRAM} Caches for Bandwidth and Capacity", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "627--638", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080243", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This paper investigates compression for DRAM caches. As the capacity of DRAM cache is typically large, prior techniques on cache compression, which solely focus on improving cache capacity, provide only a marginal benefit.
We show that more performance benefit can be obtained if the compression of the DRAM cache is tailored to provide higher bandwidth. If a DRAM cache can provide two compressed lines in a single access, and both lines are useful, the effective bandwidth of the DRAM cache would double. Unfortunately, it is not straight-forward to compress DRAM caches for bandwidth. The typically used Traditional Set Indexing (TSI) maps consecutive lines to consecutive sets, so the multiple compressed lines obtained from the set are from spatially distant locations and unlikely to be used within a short period of each other. We can change the indexing of the cache to place consecutive lines in the same set to improve bandwidth; however, when the data is incompressible, such spatial indexing reduces effective capacity and causes significant slowdown. Ideally, we would like to have spatial indexing when the data is compressible and TSI otherwise. To this end, we propose Dynamic-Indexing Cache comprEssion (DICE), a dynamic design that can adapt between spatial indexing and TSI, depending on the compressibility of the data. We also propose low-cost Cache Index Predictors (CIP) that can accurately predict the cache indexing scheme on access in order to avoid probing both indices for retrieving a given cache line. Our studies with a 1GB DRAM cache, on a wide range of workloads (including SPEC and Graph), show that DICE improves performance by 19.0\% and reduces energy-delay-product by 36\% on average. DICE is within 3\% of a design that has double the capacity and double the bandwidth. 
DICE incurs a storage overhead of less than 1KB and does not rely on any OS support.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Drumond:2017:MDE, author = "Mario Drumond and Alexandros Daglis and Nooshin Mirzadeh and Dmitrii Ustiugov and Javier Picorel and Babak Falsafi and Boris Grot and Dionisios Pnevmatikatos", title = "The {Mondrian Data Engine}", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "639--651", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080233", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The increasing demand for extracting value out of ever-growing data poses an ongoing challenge to system designers, a task only made trickier by the end of Dennard scaling. As the performance density of traditional CPU-centric architectures stagnates, advancing compute capabilities necessitates novel architectural approaches. Near-memory processing (NMP) architectures are reemerging as promising candidates to improve computing efficiency through tight coupling of logic and memory. NMP architectures are especially fitting for data analytics, as they provide immense bandwidth to memory-resident data and dramatically reduce data movement, the main source of energy consumption. Modern data analytics operators are optimized for CPU execution and hence rely on large caches and employ random memory accesses. In the context of NMP, such random accesses result in wasteful DRAM row buffer activations that account for a significant fraction of the total memory access energy. In addition, utilizing NMP's ample bandwidth with fine-grained random accesses requires complex hardware that cannot be accommodated under NMP's tight area and power constraints. 
Our thesis is that efficient NMP calls for an algorithm-hardware co-design that favors algorithms with sequential accesses to enable simple hardware that accesses memory in streams. We introduce an instance of such a co-designed NMP architecture for data analytics, the Mondrian Data Engine. Compared to a CPU-centric and a baseline NMP system, the Mondrian Data Engine improves the performance of basic data analytics operators by up to 49x and 5x, and efficiency by up to 28x and 5x, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Tsai:2017:JSD, author = "Po-An Tsai and Nathan Beckmann and Daniel Sanchez", title = "{Jenga}: Software-Defined Cache Hierarchies", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "652--665", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080214", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Caches are traditionally organized as a rigid hierarchy, with multiple levels of progressively larger and slower memories. Hierarchy allows a simple, fixed design to benefit a wide range of applications, since working sets settle at the smallest (i.e., fastest and most energy-efficient) level they fit in. However, rigid hierarchies also add overheads, because each level adds latency and energy even when it does not fit the working set. These overheads are expensive on emerging systems with heterogeneous memories, where the differences in latency and energy across levels are small. Significant gains are possible by specializing the hierarchy to applications. We propose Jenga, a reconfigurable cache hierarchy that dynamically and transparently specializes itself to applications. 
Jenga builds virtual cache hierarchies out of heterogeneous, distributed cache banks using simple hardware mechanisms and an OS runtime. In contrast to prior techniques that trade energy and bandwidth for performance (e.g., dynamic bypassing or prefetching), Jenga eliminates accesses to unwanted cache levels. Jenga thus improves both performance and energy efficiency. On a 36-core chip with a 1 GB DRAM cache, Jenga improves energy-delay product over a combination of state-of-the-art techniques by 23\% on average and by up to 85\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Boyapati:2017:AND, author = "Rahul Boyapati and Jiayi Huang and Pritam Majumder and Ki Hwan Yum and Eun Jung Kim", title = "{APPROX-NoC}: a Data Approximation Framework for Network-On-Chip Architectures", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "666--677", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080241", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "The trend of unsustainable power consumption and large memory bandwidth demands in massively parallel multicore systems, with the advent of the big data era, has brought upon the onset of alternate computation paradigms utilizing heterogeneity, specialization, processor-in-memory and approximation. Approximate Computing is being touted as a viable solution for high performance computation by relaxing the accuracy constraints of applications. This trend has been accentuated by emerging data intensive applications in domains like image/video processing, machine learning and big data analytics that allow inaccurate outputs within an acceptable variance. 
Leveraging relaxed accuracy for high throughput in Networks-on-Chip (NoCs), which have rapidly become the accepted method for connecting a large number of on-chip components, has not yet been explored. We propose APPROX-NoC, a hardware data approximation framework with an online data error control mechanism for high performance NoCs. APPROX-NoC facilitates approximate matching of data patterns, within a controllable value range, to compress them thereby reducing the volume of data movement across the chip. Our evaluation shows that APPROX-NoC achieves on average up to 9\% latency reduction and 60\% throughput improvement compared with state-of-the-art NoC data compression mechanisms, while maintaining low application error. Additionally, with a data intensive graph processing application we achieve a 36.7\% latency reduction compared to state-of-the-art compression mechanisms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Poremba:2017:TBA, author = "Matthew Poremba and Itir Akgun and Jieming Yin and Onur Kayiran and Yuan Xie and Gabriel H. Loh", title = "There and Back Again: Optimizing the Interconnect in Networks of Memory Cubes", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "678--690", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080251", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "High-performance computing, enterprise, and datacenter servers are driving demands for higher total memory capacity as well as memory performance. Memory ``cubes'' with high per-package capacity (from 3D integration) along with high-speed point-to-point interconnects provide a scalable memory system architecture with the potential to deliver both capacity and performance. 
Multiple such cubes connected together can form a ``Memory Network'' (MN), but the design space for such MNs is quite vast, including multiple topology types and multiple memory technologies per memory cube. In this work, we first analyze several MN topologies with different mixes of memory package technologies to understand the key tradeoffs and bottlenecks for such systems. We find that most of a MN's performance challenges arise from the interconnection network that binds the memory cubes together. In particular, arbitration schemes used to route through MNs, ratio of NVM to DRAM, and specific topologies used have dramatic impact on performance and energy results. Our initial analysis indicates that introducing non-volatile memory to the MN presents a unique tradeoff between memory array latency and network latency. We observe that placing NVM cubes in a specific order in the MN improves performance by reducing the network size/diameter up to a certain NVM to DRAM ratio. Novel MN topologies and arbitration schemes also provide performance and energy deltas by reducing the hop count of requests and response in the MN. 
Based on our analyses, we introduce three techniques to address MN latency issues: (1) Distance-based arbitration scheme to improve queuing latencies throughout the network, (2) skip-list topology, derived from the classic data structure, to improve network latency and link usage, and (3) the MetaCube, a denser memory cube that leverages advanced packaging technologies to improve latency by reducing MN size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Fu:2017:FRR, author = "Binzhang Fu and John Kim", title = "{Footprint}: Regulating Routing Adaptiveness in Networks-on-Chip", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "691--702", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080249", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Routing algorithms can improve network performance by maximizing routing adaptiveness but can be problematic in the presence of endpoint congestion. Tree-saturation is a well-known behavior caused by endpoint congestion. Adaptive routing can, however, spread the congestion and result in thick branches of the congestion tree --- creating Head-of-Line (HoL) blocking and degrading performance. In this work, we identify how ignoring virtual channels (VCs) and their occupancy during adaptive routing results in congestion trees with thick branches as congestion is spread to all VCs. To address this limitation, we propose Footprint routing algorithm --- a new adaptive routing algorithm that minimizes the size of the congestion tree, both in terms of the number of nodes in the congestion tree as well as branch thickness. 
Footprint achieves this by regulating adaptiveness by requiring packets to follow the path of prior packets to the same destination if the network is congested instead of forking a new path or VC. Thus, the congestion tree is dynamically kept as slim as possible and reduces HoL blocking or congestion spreading while maintaining high adaptivity and maximizing VC buffer utilization. We evaluate the proposed Footprint routing algorithm against other adaptive routing algorithms and our simulation results show that the network saturation throughput can be improved by up to 43\% (58\%) compared with the fully adaptive routing (partially adaptive routing) algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } @Article{Ebrahimi:2017:ENT, author = "Masoumeh Ebrahimi and Masoud Daneshtalab", title = "{EbDa}: a New Theory on Design and Verification of Deadlock-free Interconnection Networks", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "703--715", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080253", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Freedom from deadlock is one of the most important issues when designing routing algorithms in on-chip/off-chip networks. Many works have been developed upon Dally's theory proving that a network is deadlock-free if there is no cyclic dependency on the channel dependency graph. However, finding such acyclic graph has been very challenging, which limits Dally's theory to networks with a low number of channels. In this paper, we introduce three theorems that directly lead to routing algorithms with an acyclic channel dependency graph. 
We also propose the partitioning methodology, enabling a design to reach the maximum adaptiveness for the n-dimensional mesh and k-ary n-cube topologies with any given number of channels. In addition, deadlock-free routing algorithms can be derived ranging from maximally fully adaptive routing down to deterministic routing. The proposed theorems can drastically remove the difficulties of designing deadlock-free routing algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "https://dl.acm.org/loi/sigarch", } %%% ==================================================================== %%% Conference proceedings papers not included in regular issues: @InProceedings{Lipovski:1998:RBN, author = "Jack Lipovski", title = "Retrospective: {Banyan} networks for partitioning multiprocessor systems", crossref = "ACM:1998:PAI", pages = "1--1", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Dennis:1998:RPA, author = "Jack B. Dennis", title = "Retrospective: a preliminary architecture for a basic data flow processor", crossref = "ACM:1998:PAI", pages = "2--4", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Patel:1998:RIT, author = "Janak H. 
Patel", title = "Retrospective: {Improving} the throughput of a pipeline by insertion of delays", crossref = "ACM:1998:PAI", pages = "5--5", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Bell:1998:RWW, author = "Gordon Bell and W. D. Strecker", title = "Retrospective: {What} have we learned from the {PDP-11} --- what we have learned from {VAX} and {Alpha}", crossref = "ACM:1998:PAI", pages = "6--10", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Shustek:1998:RIT, author = "Leonard J. Shustek and Bernard L. Peuto", title = "Retrospective: an instruction timing model of {CPU} performance", crossref = "ACM:1998:PAI", pages = "11--12", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Ditzel:1998:RRH, author = "David R. Ditzel and David A.
Patterson", title = "Retrospective: a retrospective on high-level language computer architecture", crossref = "ACM:1998:PAI", pages = "13--14", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Batcher:1998:RAM, author = "Ken Batcher", title = "Retrospective: {Architecture} of a massively parallel processor", crossref = "ACM:1998:PAI", pages = "15--16", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Pier:1998:RPH, author = "Ken Pier", title = "Retrospective: a processor for a high-performance personal computer", crossref = "ACM:1998:PAI", pages = "17--19", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Kroft:1998:RLF, author = "David Kroft", title = "Retrospective: {Lockup}-free instruction fetch\slash prefetch cache organization", crossref = "ACM:1998:PAI", pages = "20--21", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Smith:1998:RSB, author = "James E. 
Smith", title = "Retrospective: a study of branch prediction strategies", crossref = "ACM:1998:PAI", pages = "22--23", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Patterson:1998:RRR, author = "David A. Patterson and Carlo H. S{\'e}quin", title = "Retrospective: {RISC I}: a {Reduced Instruction Set Computer}", crossref = "ACM:1998:PAI", pages = "24--26", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "This paper contains in column 1, page 25, the story of the origin of the name ``RISC''.", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Smith:1998:RDA, author = "James E. Smith", title = "Retrospective: {Decoupled} access\slash execute architectures", crossref = "ACM:1998:PAI", pages = "27--28", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Gottlieb:1998:RPR, author = "Allan Gottlieb", title = "Retrospective: a personal retrospective on the {NYU} ultracomputer", crossref = "ACM:1998:PAI", pages = "29--31", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Goodman:1998:RUC, author = "James R. 
Goodman", title = "Retrospective: {Using} cache memory to reduce processor-memory traffic", crossref = "ACM:1998:PAI", pages = "32--33", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Fisher:1998:RVL, author = "Joseph A. Fisher", title = "Retrospective: {Very} long instruction word architectures and the {ELI}-512", crossref = "ACM:1998:PAI", pages = "34--36", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Emer:1998:RCP, author = "Joel S. Emer and Douglas W. Clark", title = "Retrospective: {Characterization} of processor performance in the {VAX-11\slash 780}", crossref = "ACM:1998:PAI", pages = "37--38", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Patel:1998:RLO, author = "Janak H. Patel", title = "Retrospective: a low-overhead coherence solution for multiprocessors with private cache memories", crossref = "ACM:1998:PAI", pages = "39--41", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Smith:1998:RIP, author = "James E. 
Smith", title = "Retrospective: {Implementing} precise interrupts in pipelined processors", crossref = "ACM:1998:PAI", pages = "42--42", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Hwu:1998:RHH, author = "Wen-mei W. Hwu and Yale N. Patt", title = "Retrospective: {HPSm}, a high performance restricted data flow architecture having minimal functionality", crossref = "ACM:1998:PAI", pages = "43--44", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Gross:1998:RRW, author = "Thomas Gross and Monica Lam", title = "Retrospective: a retrospective on the {Warp} machines", crossref = "ACM:1998:PAI", pages = "45--47", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Dubois:1998:RMA, author = "Michel Dubois and Christoph Scheurich", title = "Retrospective: {Memory} access buffering in multiprocessors", crossref = "ACM:1998:PAI", pages = "48--50", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Sohi:1998:RII, author = "Gurindar S. 
Sohi", title = "Retrospective: {Instruction} issue logic for high-performance, interruptible pipelined processors", crossref = "ACM:1998:PAI", pages = "51--53", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Dally:1998:RJM, author = "William J. Dally and Andrew Chien and Stuart Fiske and Waldemar Horwat and Richard Lethin and Michael Noakes and Peter Nuth and Ellen Spertus and Deborah Wallach and D. Scott Wills and Andrew Chang and John Keen", title = "Retrospective: {The} {J}-machine", crossref = "ACM:1998:PAI", pages = "54--58", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Baer:1998:RIP, author = "Jean-Loup Baer and Wen-Hann Wang", title = "Retrospective: {On} the inclusion properties for multi-level cache hierarchies", crossref = "ACM:1998:PAI", pages = "59--60", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Hennessy:1998:RED, author = "John Hennessy", title = "Retrospective: {Evaluation} of directory schemes for cache coherence", crossref = "ACM:1998:PAI", pages = "61--62", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Adve:1998:RWO, author = "Sarita V. Adve and Mark D. 
Hill", title = "Retrospective: {Weak} ordering --- a new definition", crossref = "ACM:1998:PAI", pages = "63--66", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Gharachorloo:1998:RMC, author = "Kourosh Gharachorloo", title = "Retrospective: {Memory} consistency and event ordering in scalable shared-memory multiprocessors", crossref = "ACM:1998:PAI", pages = "67--70", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Jouppi:1998:RID, author = "Norman P. Jouppi", title = "Retrospective: {Improving} direct-mapped cache performance by the addition of a small fully-associative cache and prefetch buffers", crossref = "ACM:1998:PAI", pages = "71--73", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Papadopoulos:1998:RME, author = "George M. Papadopoulos and David E. Culler", title = "Retrospective: {Monsoon}: an explicit token-store architecture", crossref = "ACM:1998:PAI", pages = "74--76", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Hwu:1998:RIA, author = "Wen-mei W. 
Hwu", title = "Retrospective: {Impact}: an architectural framework for multiple-instruction issue", crossref = "ACM:1998:PAI", pages = "77--79", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Lenoski:1998:RDP, author = "Daniel E. Lenoski and James P. Laudon", title = "Retrospective: {The} {DASH} prototype: implementation and performance", crossref = "ACM:1998:PAI", pages = "80--82", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{vonEicken:1998:RAM, author = "Thorsten von Eicken and David E. Culler and Klaus Erik Schauser and Seth Copen Goldstein", title = "Retrospective: {Active} messages: a mechanism for integrating computation and communication", crossref = "ACM:1998:PAI", pages = "83--84", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Ni:1998:RTM, author = "Lionel Ni", title = "Retrospective: {The} turn model for adaptive routing", crossref = "ACM:1998:PAI", pages = "85--86", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Yeh:1998:RAI, author = "Tse-Yu Yeh and Yale N. 
Patt", title = "Retrospective: {Alternative} implementations of two-level adaptive training branch prediction", crossref = "ACM:1998:PAI", pages = "87--88", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Veidenbaum:1998:RCS, author = "A. Veidenbaum and P.-C. Yew and D. J. Kuck and C. D. Polychronopoulos and D. H. Padua and E. S. Davidson and K. Gallivan", title = "Retrospective: {The} {Cedar} system", crossref = "ACM:1998:PAI", pages = "89--91", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Blumrich:1998:RVM, author = "Matthias A. Blumrich and Kai Li and Richard D. Alpert and Cezary Dubnicki and Edward W. Felten and Jonathan Sandberg", title = "Retrospective: {Virtual} memory mapped network interface for the {SHRIMP} multicomputer", crossref = "ACM:1998:PAI", pages = "92--94", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Kuskin:1998:RSF, author = "Jeffrey S. Kuskin", title = "Retrospective: {The} {Stanford FLASH} multiprocessor", crossref = "ACM:1998:PAI", pages = "95--97", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Reinhardt:1998:RTT, author = "Steven K. Reinhardt and James R. Larus and David A. 
Wood", title = "Retrospective: {Tempest} and {Typhoon}: user-level shared memory", crossref = "ACM:1998:PAI", pages = "98--102", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Agarwal:1998:RAM, author = "Anant Agarwal", title = "Retrospective: {The} {MIT Alewife} machine: architecture and performance", crossref = "ACM:1998:PAI", pages = "103--110", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Sohi:1998:RMP, author = "Gurindar Sohi", title = "Retrospective: {Multiscalar} processors", crossref = "ACM:1998:PAI", pages = "111--114", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Tullsen:1998:RSM, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. Levy", title = "Retrospective: {Simultaneous} multithreading: maximizing on-chip parallelism", crossref = "ACM:1998:PAI", pages = "115--116", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Goke:1998:BNP, author = "L. Rodney Goke and G. J. 
Lipovski",
  title =        "{Banyan} networks for partitioning multiprocessor
                 systems",
  crossref =     "ACM:1998:PAI",
  pages =        "117--124",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Dennis:1998:PAB,
  author =       "Jack B. Dennis and David P. Misunas",
  title =        "A preliminary architecture for a basic data-flow
                 processor",
  crossref =     "ACM:1998:PAI",
  pages =        "125--131",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Patel:1998:ITP,
  author =       "Janak H. Patel and Edward S. Davidson",
  title =        "Improving the throughput of a pipeline by insertion of
                 delays",
  crossref =     "ACM:1998:PAI",
  pages =        "132--137",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Bell:1998:CSW,
  author =       "Gordon Bell and William D. Strecker",
  title =        "Computer structures: what have we learned from the
                 {PDP-11}?",
  crossref =     "ACM:1998:PAI",
  pages =        "138--151",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Peuto:1998:ITM,
  author =       "Bernard L. Peuto and Leonard J.
Shustek",
  title =        "An instruction timing model of {CPU} performance",
  crossref =     "ACM:1998:PAI",
  pages =        "152--165",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Ditzel:1998:RHL,
  author =       "David R. Ditzel and David A. Patterson",
  title =        "Retrospective on high-level language computer
                 architecture",
  crossref =     "ACM:1998:PAI",
  pages =        "166--173",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Batcher:1998:AMP,
  author =       "Kenneth E. Batcher",
  title =        "Architecture of a massively parallel processor",
  crossref =     "ACM:1998:PAI",
  pages =        "174--179",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Lampson:1998:PHP,
  author =       "Butler W. Lampson and Kenneth A.
Pier",
  title =        "A processor for a high-performance personal computer",
  crossref =     "ACM:1998:PAI",
  pages =        "180--194",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Kroft:1998:LFI,
  author =       "David Kroft",
  title =        "Lockup-free instruction fetch\slash prefetch cache
                 organization",
  crossref =     "ACM:1998:PAI",
  pages =        "195--201",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Smith:1998:SBP,
  author =       "James E. Smith",
  title =        "A study of branch prediction strategies",
  crossref =     "ACM:1998:PAI",
  pages =        "202--215",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Patterson:1998:RRI,
  author =       "David A. Patterson and Carlo H. Sequin",
  title =        "{RISC I}: a reduced instruction set {VLSI} computer",
  crossref =     "ACM:1998:PAI",
  pages =        "216--230",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Smith:1998:DAE,
  author =       "James E.
Smith",
  title =        "Decoupled access\slash execute computer architectures",
  crossref =     "ACM:1998:PAI",
  pages =        "231--238",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Gottlieb:1998:NUD,
  author =       "Allan Gottlieb and Ralph Grishman and Clyde P. Kruskal
                 and Kevin P. McAuliffe and Larry Rudolph and Marc
                 Snir",
  title =        "The {NYU Ultracomputer} --- designing a {MIMD},
                 shared-memory parallel machine",
  crossref =     "ACM:1998:PAI",
  pages =        "239--254",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Goodman:1998:UCM,
  author =       "James R. Goodman",
  title =        "Using cache memory to reduce processor-memory traffic",
  crossref =     "ACM:1998:PAI",
  pages =        "255--262",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Fisher:1998:VLI,
  author =       "Joseph A. Fisher",
  title =        "Very long instruction word architectures and the
                 {ELI-512}",
  crossref =     "ACM:1998:PAI",
  pages =        "263--273",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Emer:1998:CPP,
  author =       "Joel S. Emer and Douglas W.
Clark",
  title =        "A characterization of processor performance in the
                 {VAX-11\slash 780}",
  crossref =     "ACM:1998:PAI",
  pages =        "274--283",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Papamarcos:1998:LOC,
  author =       "Mark S. Papamarcos and Janak H. Patel",
  title =        "A low-overhead coherence solution for multiprocessors
                 with private cache memories",
  crossref =     "ACM:1998:PAI",
  pages =        "284--290",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Smith:1998:IPI,
  author =       "James E. Smith and Andrew R. Pleszkun",
  title =        "Implementation of precise interrupts in pipelined
                 processors",
  crossref =     "ACM:1998:PAI",
  pages =        "291--299",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Hwu:1998:HHP,
  author =       "Wen-mei Hwu and Yale N. Patt",
  title =        "{HPSm}, a high performance restricted data flow
                 architecture having minimal functionality",
  crossref =     "ACM:1998:PAI",
  pages =        "300--308",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Annaratone:1998:WAI,
  author =       "Marco Annaratone and Emmanuel Arnould and Thomas Gross
                 and H. T. Kung and Monica S. Lam and Onat
                 Menzilcio{\u{g}}lu and Ken Sarocky and Jon A.
Webb",
  title =        "{Warp} architecture and implementation",
  crossref =     "ACM:1998:PAI",
  pages =        "309--319",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Dubois:1998:MAB,
  author =       "Michel Dubois and Christoph Scheurich and Faye Briggs",
  title =        "Memory access buffering in multiprocessors",
  crossref =     "ACM:1998:PAI",
  pages =        "320--328",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Sohi:1998:IIL,
  author =       "Gurindar S. Sohi and Sriram Vajapeyam",
  title =        "Instruction issue logic for high-performance,
                 interruptible pipelined processors",
  crossref =     "ACM:1998:PAI",
  pages =        "329--336",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Dally:1998:AMD,
  author =       "William J.
Dally and Linda Chao and Andrew Chien and Soha Hassoun and
                 Waldemar Horwat and Jon Kaplan and Paul Song and Brian
                 Totty and Scott Wills",
  title =        "Architecture of a message-driven processor",
  crossref =     "ACM:1998:PAI",
  pages =        "337--344",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Baer:1998:IPM,
  author =       "Jean-Loup Baer and Wen-Hann Wang",
  title =        "On the inclusion properties for multi-level cache
                 hierarchies",
  crossref =     "ACM:1998:PAI",
  pages =        "345--352",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Agarwal:1998:EDS,
  author =       "Anant Agarwal and Richard Simoni and John Hennessy and
                 Mark Horowitz",
  title =        "An evaluation of directory schemes for cache
                 coherence",
  crossref =     "ACM:1998:PAI",
  pages =        "353--362",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Adve:1998:WON,
  author =       "Sarita V. Adve and Mark D.
Hill",
  title =        "Weak ordering --- a new definition",
  crossref =     "ACM:1998:PAI",
  pages =        "363--375",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Gharachorloo:1998:MCE,
  author =       "Kourosh Gharachorloo and Daniel Lenoski and James
                 Laudon and Phillip Gibbons and Anoop Gupta and John
                 Hennessy",
  title =        "Memory consistency and event ordering in scalable
                 shared-memory multiprocessors",
  crossref =     "ACM:1998:PAI",
  pages =        "376--387",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Jouppi:1998:IDM,
  author =       "Norman P. Jouppi",
  title =        "Improving direct-mapped cache performance by the
                 addition of a small fully-associative cache and
                 prefetch buffers",
  crossref =     "ACM:1998:PAI",
  pages =        "388--397",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Papadopoulos:1998:MET,
  author =       "Gregory M. Papadopoulos and David E. Culler",
  title =        "{Monsoon}: an explicit token-store architecture",
  crossref =     "ACM:1998:PAI",
  pages =        "398--407",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Chang:1998:IAF,
  author =       "Pohua P. Chang and Scott A. Mahlke and William Y. Chen
                 and Nancy J. Warter and Wen-mei W.
Hwu",
  title =        "{IMPACT}: an architectural framework for
                 multiple-instruction-issue processors",
  crossref =     "ACM:1998:PAI",
  pages =        "408--417",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Lenoski:1998:DPI,
  author =       "Daniel Lenoski and James Laudon and Truman Joe and
                 David Nakahira and Luis Stevens and Anoop Gupta and
                 John Hennessy",
  title =        "The {DASH} prototype: implementation and performance",
  crossref =     "ACM:1998:PAI",
  pages =        "418--429",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{vonEicken:1998:AMM,
  author =       "Thorsten von Eicken and David E. Culler and Seth Copen
                 Goldstein and Klaus Erik Schauser",
  title =        "Active messages: a mechanism for integrating
                 communication and computation",
  crossref =     "ACM:1998:PAI",
  pages =        "430--440",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Glass:1998:TMA,
  author =       "Christopher J. Glass and Lionel M. Ni",
  title =        "The turn model for adaptive routing",
  crossref =     "ACM:1998:PAI",
  pages =        "441--450",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Yeh:1998:AIT,
  author =       "Tse-Yu Yeh and Yale N.
Patt",
  title =        "Alternative implementations of two-level adaptive
                 branch prediction",
  crossref =     "ACM:1998:PAI",
  pages =        "451--461",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Kuck:1998:CSI,
  author =       "D. Kuck and E. Davidson and D. Lawrie and A. Sameh and
                 C.-Q. Zhu and A. Veidenbaum and J. Konicek and P. Yew
                 and K. Gallivan and W. Jalby and H. Wijshoff and R.
                 Bramley and U. M. Yang and P. Emrath and D. Padua and
                 R. Eigenmann and J. Hoeflinger and G. Jayson and Z. Li
                 and T. Murphy and J. Andrews",
  title =        "The {Cedar} system and an initial performance study",
  crossref =     "ACM:1998:PAI",
  pages =        "462--472",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Blumrich:1998:VMM,
  author =       "Matthias A. Blumrich and Kai Li and Richard Alpert and
                 Cezary Dubnicki and Edward W. Felten and Jonathan
                 Sandberg",
  title =        "Virtual memory mapped network interface for the
                 {SHRIMP} multicomputer",
  crossref =     "ACM:1998:PAI",
  pages =        "473--484",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Kuskin:1998:SFM,
  author =       "Jeffrey Kuskin and David Ofelt and Mark Heinrich and
                 John Heinlein and Richard Simoni and K. Gharachorloo
                 and J. Chapin and D. Nakahira and J. Baxter and M.
                 Horowitz and A. Gupta and M. Rosenblum and J.
Hennessy",
  title =        "The {Stanford FLASH} multiprocessor",
  crossref =     "ACM:1998:PAI",
  pages =        "485--496",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Reinhardt:1998:TTU,
  author =       "Steven K. Reinhardt and James R. Larus and David A.
                 Wood",
  title =        "{Tempest} and {Typhoon}: user-level shared memory",
  crossref =     "ACM:1998:PAI",
  pages =        "497--508",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Agarwal:1998:AMA,
  author =       "Anant Agarwal and Ricardo Bianchini and David Chaiken
                 and Kirk L. Johnson and David Kranz and J. Kubiatowicz
                 and B.-H. Lim and K. Mackenzie and D. Yeung",
  title =        "The {MIT Alewife} machine: architecture and
                 performance",
  crossref =     "ACM:1998:PAI",
  pages =        "509--520",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Sohi:1998:MP,
  author =       "Gurindar S. Sohi and Scott E. Breach and T. N.
                 Vijaykumar",
  title =        "Multiscalar processors",
  crossref =     "ACM:1998:PAI",
  pages =        "521--532",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Tullsen:1998:SMM,
  author =       "Dean M. Tullsen and Susan J. Eggers and Henry M.
Levy",
  title =        "Simultaneous multithreading: maximizing on-chip
                 parallelism",
  crossref =     "ACM:1998:PAI",
  pages =        "533--544",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

%%% ====================================================================
%%% Cross-referenced entries must come last:

@Proceedings{Lipovski:1973:PFA,
  editor =       "G. Jack Lipovski and Stephen Anthony Szygenda",
  booktitle =    "{Proceedings of the First Annual Symposium on Computer
                 Architecture, December 9--11, 1973, University of
                 Florida, Gainesville, Florida}",
  title =        "{Proceedings of the First Annual Symposium on Computer
                 Architecture, December 9--11, 1973, University of
                 Florida, Gainesville, Florida}",
  volume =       "2(4)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "iv + 277",
  year =         "1973",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "TK7885.A1",
  bibdate =      "Fri May 12 14:36:31 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE catalog no. 73CH0824-3C.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800123",
  acknowledgement = ack-nhfb,
}

@Proceedings{King:1975:CPA,
  editor =       "Willis K.
King",
  booktitle =    "{Conference Proceedings: 2nd Annual Symposium on
                 Computer Architecture, Houston, Texas, January 20--22,
                 1975}",
  title =        "{Conference Proceedings: 2nd Annual Symposium on
                 Computer Architecture, Houston, Texas, January 20--22,
                 1975}",
  volume =       "3(4)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "vi + 231",
  year =         "1975",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:27:32 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=642089",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1976:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 3rd Annual Symposium on
                 Computer Architecture, Clearwater, Florida, January
                 19--21, 1976}",
  title =        "{Conference Proceedings: 3rd Annual Symposium on
                 Computer Architecture, Clearwater, Florida, January
                 19--21, 1976}",
  volume =       "??(??)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1976",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:20:44 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE no.
75CH1043-5C.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800110",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1977:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 4th Annual Symposium on
                 Computer Architecture, Silver Spring, Maryland, March
                 23--25, 1977}",
  title =        "{Conference Proceedings: 4th Annual Symposium on
                 Computer Architecture, Silver Spring, Maryland, March
                 23--25, 1977}",
  volume =       "??(??)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 438",
  year =         "1977",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "QA76.9.A73 S97 1977",
  bibdate =      "Fri May 12 14:22:57 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE no. 77 CH1182-5C.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800255",
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the next entry pairs the ``5th'' symposium with 1979
%%% and volume 6(7), while its neighbours run 4th = 1977 and
%%% 7th = 1980 = volume 8(3), with no 6th entry in between.  That
%%% pattern suggests the 5th symposium and volume 6 may belong to 1978.
%%% TODO: confirm against the ACM portal before changing any data,
%%% because child entries cross-reference this key.

@Proceedings{IEEE:1979:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 5th Annual Symposium on
                 Computer Architecture, Palo Alto, California, April
                 23--25, 1979}",
  title =        "{Conference Proceedings: 5th Annual Symposium on
                 Computer Architecture, Palo Alto, California, April
                 23--25, 1979}",
  volume =       "6(7)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1979",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:22:57 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800094",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1980:CPA,
  editor =       "{ACM}",
  booktitle =    "{Conference Proceedings: 7th Annual Symposium on
                 Computer Architecture, La Baule, France, 6--8 May
                 1980}",
  title =        "{Conference Proceedings: 7th Annual Symposium on
                 Computer Architecture, La Baule, France, 6--8 May
                 1980}",
  volume =       "8(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "333",
  year =         "1980",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  bibdate =      "Fri Sep 16 10:53:10 1994",
  bibsource =    "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Math/fparith.bib;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800090",
  acknowledgement = ack-nj,
}

@Proceedings{IEEE:1981:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 8th Annual Symposium on
                 Computer Architecture, Minneapolis, Minnesota, May
                 12--14, 1981}",
  title =        "{Conference Proceedings: 8th Annual Symposium on
                 Computer Architecture, Minneapolis, Minnesota, May
                 12--14, 1981}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1981",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:25:51 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800052",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1982:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference proceedings: the 9th annual Symposium on
                 Computer Architecture: April 26--29, 1982, Austin,
                 Texas}",
  title =        "{Conference proceedings: the 9th annual Symposium on
                 Computer Architecture: April 26--29, 1982, Austin,
                 Texas}",
  volume =       "10(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "viii + 335",
  year =         "1982",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "QA76.9.A73 S97 1982",
  bibdate =      "Fri May 12 14:17:17 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415820. IEEE catalogue no. 82CH1754-1.
                 IEEE Computer Society order no.
411.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800048",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1983:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference proceedings: the 10th annual International
                 Symposium on Computer Architecture, Royal Institute of
                 Technology, Stockholm, Sweden}",
  title =        "{Conference proceedings: the 10th annual International
                 Symposium on Computer Architecture, Royal Institute of
                 Technology, Stockholm, Sweden}",
  volume =       "11(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 438",
  year =         "1983",
  CODEN =        "CANED2",
  ISBN =         "0-89791-101-6",
  ISBN-13 =      "978-0-89791-101-6",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 .S97 1983",
  bibdate =      "Fri May 12 13:53:44 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415830. IEEE catalog no. 83CH1889-5.
                 IEEE Computer Society order no. 473.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800046",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1984:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 11th Annual International Symposium on Computer
                 Architecture, June 5--7, 1984, Ann Arbor, Michigan
                 conference proceedings}",
  title =        "{The 11th Annual International Symposium on Computer
                 Architecture, June 5--7, 1984, Ann Arbor, Michigan
                 conference proceedings}",
  volume =       "12(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 373",
  year =         "1984",
  CODEN =        "CANED2",
  ISBN =         "0-8186-0538-3 (paperback)",
  ISBN-13 =      "978-0-8186-0538-3 (paperback)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 1984",
  bibdate =      "Fri May 12 14:30:24 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415840. IEEE catalog no. 84CH2051-1.
                 IEEE Computer Society no.
538.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800015",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1985:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 12th Annual International Symposium on Computer
                 Architecture, June 17--19, 1985, Boston,
                 Massachusetts: conference proceedings}",
  title =        "{The 12th Annual International Symposium on Computer
                 Architecture, June 17--19, 1985, Boston,
                 Massachusetts: conference proceedings}",
  volume =       "13(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiv + 428",
  year =         "1985",
  CODEN =        "CANED2",
  ISBN =         "0-8186-0634-7",
  ISBN-13 =      "978-0-8186-0634-2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 C65",
  bibdate =      "Fri May 12 13:47:45 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415850. IEEE catalog no. 85CH2144-4.
                 IEEE Computer Society order no. 634.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=327010",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1986:CPT,
  editor =       "{IEEE}",
  booktitle =    "{Conference proceedings: the thirteenth annual
                 International symposium on computer Architecture, June
                 2--5, 1986, Tokyo, Japan}",
  title =        "{Conference proceedings: the thirteenth annual
                 International symposium on computer Architecture, June
                 2--5, 1986, Tokyo, Japan}",
  volume =       "14(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiii + 454",
  year =         "1986",
  CODEN =        "CANED2",
  ISBN =         "0-8186-8719-3",
  ISBN-13 =      "978-0-8186-8719-8",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1986",
  bibdate =      "Fri May 12 13:51:08 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415860. IEEE catalogue number
                 86CH12291-3.
IEEE Computer society order number 719.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=17407",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1987:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 14th Annual International Symposium on Computer
                 Architecture, June 2--5, 1987, Pittsburgh,
                 Pennsylvania: Conference proceedings}",
  title =        "{The 14th Annual International Symposium on Computer
                 Architecture, June 2--5, 1987, Pittsburgh,
                 Pennsylvania: Conference proceedings}",
  volume =       "15(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 321",
  year =         "1987",
  CODEN =        "CANED2",
  ISBN =         "0-8186-8776-2 (casebound), 0-8186-0776-9 (paperback),
                 0-8186-4776-0 (microfiche)",
  ISBN-13 =      "978-0-8186-8776-1 (casebound), 978-0-8186-0776-9
                 (paperback), 978-0-8186-4776-5 (microfiche)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1987",
  bibdate =      "Fri May 12 14:07:52 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM Order No.
415870.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=30350",
  acknowledgement = ack-nhfb,
}

@Proceedings{IEEE:1988:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 15th Annual International Symposium on Computer
                 Architecture: Conference proceedings, May 30--June 2,
                 1988, Honolulu, Hawaii}",
  title =        "{The 15th Annual International Symposium on Computer
                 Architecture: Conference proceedings, May 30--June 2,
                 1988, Honolulu, Hawaii}",
  volume =       "16(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 461",
  year =         "1988",
  CODEN =        "CANED2",
  ISBN =         "0-8186-0861-7 (paperback), 0-8186-4861-9 (microfiche),
                 0-8186-8861-0 (case)",
  ISBN-13 =      "978-0-8186-0861-2 (paperback), 978-0-8186-4861-8
                 (microfiche), 978-0-8186-8861-4 (case)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 C65",
  bibdate =      "Fri May 12 14:09:39 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415880.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=52400",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1989:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 16th annual International
                 Symposium on Computer Architecture, May 28--June 1,
                 1989, Jerusalem, Israel}",
  title =        "{Proceedings of the 16th annual International
                 Symposium on Computer Architecture, May 28--June 1,
                 1989, Jerusalem, Israel}",
  volume =       "17(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xvii + 426",
  year =         "1989",
  CODEN =        "CANED2",
  ISBN =         "0-89791-319-1, 0-8186-5948-3 (microfiche),
                 0-8186-8948-X (casebound), 0-8186-1948-1 (paperback)",
  ISBN-13 =      "978-0-89791-319-5, 978-0-8186-5948-5 (microfiche),
                 978-0-8186-8948-2 (casebound), 978-0-8186-1948-9
                 (paperback)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 C65",
  bibdate =      "Fri May 12 13:42:34 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415890. IEEE catalog number
                 89CH2705-2.
IEEE Computer Society order number 1948.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=74925",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '89 Proceedings",
}

%%% ISCA 17 (Seattle, May 1990) proceedings; volume 18(2) of the series.
@Proceedings{IEEE:1990:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: the 17th annual International Symposium on Computer Architecture, May 28--31, 1990, Seattle, Washington}",
  title =        "{Proceedings: the 17th annual International Symposium on Computer Architecture, May 28--31, 1990, Seattle, Washington}",
  volume =       "18(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xv + 378",
  year =         "1990",
  CODEN =        "CANED2",
  ISBN =         "0-8186-9047-X (casebound), 0-89791-366-3, 0-8186-2047-1 (paperback), 0-8186-6047-3 (microfiche)",
  ISBN-13 =      "978-0-8186-9047-1 (casebound), 978-0-89791-366-9, 978-0-8186-2047-8 (paperback), 978-0-8186-6047-4 (microfiche)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1990",
  bibdate =      "Fri May 12 14:04:34 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415900.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=325164",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '17 Proceedings",
}

%%% ISCA 18 (Toronto, May 1991) proceedings; volume 19(3) of the series.
@Proceedings{ACM:1991:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 18th International Symposium on Computer Architecture: May 27--30, 1991, Toronto, Canada}",
  title =        "{Proceedings of the 18th International Symposium on Computer Architecture: May 27--30, 1991, Toronto, Canada}",
  volume =       "19(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xv + 399",
  year =         "1991",
  CODEN =        "CANED2",
  ISBN =         "0-89791-394-9",
  ISBN-13 =      "978-0-89791-394-2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9 A73 I56 1991",
  bibsource =    "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Os/IMMD_IV.bib; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415910.
IEEE catalog number 91CH2995-9. IEEE Computer Society order number 2146.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=115952",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '18 Proceedings",
}

%%% ISCA 19 (Gold Coast, May 1992) proceedings; volume 20(2) of the series.
@Proceedings{IEEE:1992:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings, the 19th annual International Symposium on Computer Architecture: May 19--21, 1992, Gold Coast, Queensland, Australia}",
  title =        "{Proceedings, the 19th annual International Symposium on Computer Architecture: May 19--21, 1992, Gold Coast, Queensland, Australia}",
  volume =       "20(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvi + 439",
  year =         "1992",
  CODEN =        "CANED2",
  ISBN =         "0-89791-509-7 (soft cover), 0-8186-2940-1 (perfect bound), 0-8186-2942-8 (casebound), 0-8186-2941-X (microfiche)",
  ISBN-13 =      "978-0-89791-509-0 (soft cover), 978-0-8186-2940-2 (perfect bound), 978-0-8186-2942-6 (casebound), 978-0-8186-2941-9 (microfiche)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1992",
  bibdate =      "Fri May 12 13:59:17 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415920. IEEE catalog number 92CH3156-7.
IEEE Computer Society order number 2940.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=139669",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '19 Proceedings",
}

%%% ISCA 20 (San Diego, May 1993) proceedings; volume 21(2) of the series.
%%% This entry also carries journal and month fields.
@Proceedings{ACM:1993:AIS,
  editor =       "{ACM}",
  booktitle =    "{20th Annual International Symposium on Computer Architecture ISCA '20, San Diego, CA, USA, May 16--19, 1993}",
  title =        "{20th Annual International Symposium on Computer Architecture ISCA '20, San Diego, CA, USA, May 16--19, 1993}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 361",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISBN =         "0-8186-3810-9 (paper), 0-8186-3811-7 (microfiche), 0-8186-3812-5 (case)",
  ISBN-13 =      "978-0-8186-3810-7 (paper), 978-0-8186-3811-4 (microfiche), 978-0-8186-3812-1 (case)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I58 1993",
  bibdate =      "Sat Sep 28 19:27:02 MDT 1996",
  bibsource =    "ftp://ftp.math.utah.edu/pub/tex/bib/mach.bib; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415930. IEEE catalog number 93CH3284-7.
IEEE Computer Society Press order number 3810-02.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=165123",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE; ACM",
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% ISCA 21 (Chicago, April 1994) proceedings; volume 22(2) of the series.
@Proceedings{IEEE:1994:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: the 21st Annual International Symposium on Computer Architecture, April 18--21, 1994, Chicago, Illinois}",
  title =        "{Proceedings: the 21st Annual International Symposium on Computer Architecture, April 18--21, 1994, Chicago, Illinois}",
  volume =       "22(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 394",
  year =         "1994",
  CODEN =        "CANED2",
  ISBN =         "0-8186-5510-0 (paper), 0-8186-5511-9 (microfiche), 0-8186-5512-7 (casebound)",
  ISBN-13 =      "978-0-8186-5510-4 (paper), 978-0-8186-5511-1 (microfiche), 978-0-8186-5512-8 (casebound)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 1994",
  bibdate =      "Fri May 12 13:45:19 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=191995",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '21 Proceedings",
}

%%% ISCA 22 (Santa Margherita Ligure, June 1995) proceedings; volume 23(2).
@Proceedings{ACM:1995:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings, the 22nd Annual International Symposium on Computer Architecture: June 22--24, 1995, Santa Margherita Ligure, Italy}",
  title =        "{Proceedings, the 22nd Annual International Symposium on Computer Architecture: June 22--24, 1995, Santa Margherita Ligure, Italy}",
  volume =       "23(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiii + 426",
  year =         "1995",
  CODEN =        "CANED2",
  ISBN =         "0-89791-698-0",
  ISBN-13 =      "978-0-89791-698-1",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1995",
  bibdate =      "Fri May 12 13:37:23 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415950.
IEEE catalog number 95CS35801. IEEE Computer Society order number PRO7677.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=223982",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '22",
}

%%% ISCA 23 (Philadelphia, May 1996) proceedings; volume 24(2) of the series.
%%% The series field uses the shared journal macro, like the other ISCA
%%% entries, instead of a separately spelled literal string.
@Proceedings{ACM:1996:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: the 23rd Annual International Symposium on Computer Architecture, May 22--24, 1996, Philadelphia, Pennsylvania}",
  title =        "{Proceedings: the 23rd Annual International Symposium on Computer Architecture, May 22--24, 1996, Philadelphia, Pennsylvania}",
  volume =       "24(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 318",
  year =         "1996",
  ISBN =         "0-89791-786-3",
  ISBN-13 =      "978-0-89791-786-5",
  LCCN =         "QA76.9.A73 S97 1996",
  bibdate =      "Fri May 12 12:36:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM order number 415960.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=232973",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '23 proceedings; FCRC '96.",
}

%%% ISCA 24 (Denver, June 1997) proceedings; volume 25(2) of the series.
@Proceedings{ACM:1997:AIS,
  editor =       "{ACM}",
  booktitle =    "{The 24th Annual International Symposium on Computer Architecture, June 2--4, 1997, Denver, Colorado: conference proceedings}",
  title =        "{The 24th Annual International Symposium on Computer Architecture, June 2--4, 1997, Denver, Colorado: conference proceedings}",
  volume =       "25(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vii + 350",
  year =         "1997",
  CODEN =        "CANED2",
  ISBN =         "0-89791-901-7",
  ISBN-13 =      "978-0-89791-901-2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S94 1997",
  bibdate =      "Fri May 12 12:36:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM order number 415974.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=264107",
  acknowledgement = ack-nhfb,
}

%%% ISCA 25 (Barcelona, 1998) proceedings.
@Proceedings{ACM:1998:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: the 25th Annual
International Symposium on Computer Architecture, June 27--July 1, 1998, Barcelona, Spain}",
  title =        "{Proceedings: the 25th Annual International Symposium on Computer Architecture, June 27--July 1, 1998, Barcelona, Spain}",
  volume =       "26(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiii + 394",
  year =         "1998",
  ISBN =         "0-8186-8491-7, 0-8186-8492-5, 0-8186-8493-3",
  ISBN-13 =      "978-0-8186-8491-3, 978-0-8186-8492-0, 978-0-8186-8493-7",
  LCCN =         "QA76.9.A73 S97 1998",
  bibdate =      "Fri May 12 12:36:10 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM Order Number 414984. IEEE Computer Society Order Number PR08491; IEEE Order Plan Catalog Number 98CB36235.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=279358; http://portal.acm.org/toc.cfm?id=285930",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '25 proceedings.",
}

%%% ISCA 26 (Atlanta, May 1999) proceedings.
%%% volume added: the neighbouring years are 26(3) and 28(2), and this was
%%% the only ISCA entry with a series but no volume.
%%% NOTE(review): confirm 27(2) against the ACM Digital Library issue list.
@Proceedings{IEEE:1999:PIS,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the 26th International Symposium on Computer Architecture: May 2--4, 1999, Atlanta, Georgia}",
  title =        "{Proceedings of the 26th International Symposium on Computer Architecture: May 2--4, 1999, Atlanta, Georgia}",
  volume =       "27(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 317",
  year =         "1999",
  CODEN =        "CANED2",
  ISBN =         "0-7695-0170-2, 0-7695-0171-0 (casebound)",
  ISBN-13 =      "978-0-7695-0170-3, 978-0-7695-0171-0 (casebound)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73. S9 1999",
  bibdate =      "Fri May 12 13:33:37 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE Computer Society Order Number PR00170.
IEEE Order Plan Catalog Number 99CB36367.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=300979",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '99 proceedings",
}

%%% ISCA 27 (Vancouver, June 2000) proceedings; volume 28(2) of the series.
%%% The series field uses the shared journal macro, like the other ISCA
%%% entries, instead of a separately spelled literal string.
@Proceedings{ACM:2000:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 27th International Symposium on Computer Architecture, June 12--14, 2000, Vancouver, British Columbia, Canada}",
  title =        "{Proceedings of the 27th International Symposium on Computer Architecture, June 12--14, 2000, Vancouver, British Columbia, Canada}",
  volume =       "28(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vi + 327",
  year =         "2000",
  ISBN =         "1-58113-232-8",
  ISBN-13 =      "978-1-58113-232-8",
  LCCN =         "QA76.9.A73 S97 2000",
  bibdate =      "Fri May 12 12:35:59 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; z3950.bibsys.no:2100/BIBSYS",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=339647",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '27 proceedings.",
}

%%% Readings in Computer Architecture (Morgan Kaufmann, 2000), edited volume.
%%% Editor names: the library-catalog parentheticals with expanded given
%%% names were removed so that BibTeX parses plain First/Last names.
%%% Table of contents: the Thornton CDC 6600 paper (Fall Joint Computer
%%% Conference, vol. 26) is dated 1964.
@Book{Hill:2000:RCA,
  editor =       "Mark D. Hill and Norman P. Jouppi and Gurindar Sohi",
  booktitle =    "Readings in Computer Architecture",
  title =        "Readings in Computer Architecture",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adrsf,
  pages =        "xviii + 717",
  year =         "2000",
  ISBN =         "1-55860-539-8",
  ISBN-13 =      "978-1-55860-539-8",
  LCCN =         "QA76.9.A73 H55 2000",
  bibdate =      "Fri May 12 15:34:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/w/wilkes-maurice-v.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib; z3950.loc.gov:7090/Voyager",
  URL =          "http://books.elsevier.com/bookscat/links/details.asp?isbn=1558605398; http://www.loc.gov/catdir/description/els033/99044480.html; http://www.loc.gov/catdir/toc/els033/99044480.html; https://archive.org/details/readingsincomput0000hill/page/n9/mode/2up?q=Slave+memories; https://shop.elsevier.com/books/readings-in-computer-architecture/hill/978-0-08-057364-9",
  acknowledgement = ack-nhfb,
  shorttableofcontents = "1: Classic Machines: Technology, Implementation, and Economics \\
                 2: Methods \\
                 3: Instruction Sets \\
                 4: Instruction Level Parallelism (ILP) \\
                 5: Dataflow and Multithreading \\
                 6: Memory Systems \\
                 7: I/O: Storage Systems, Networks, and Graphics \\
                 8: Single-Instruction Multiple Data (SIMD) Parallelism \\
                 9: Multiprocessors and Multicomputers \\
                 10: Recent Implementations and Future Prospects",
  subject =      "Computer architecture",
  tableofcontents = "PREFACE \\
                 CHAPTER 1: Classic Machines: Technology, Implementation, and Economics \\
                 G. M. Amdahl, G. A. Blaauw, F. P. Brooks, Jr., ``Architecture of the IBM System/360,'' IBM Journal of Research and Development, April 1964 \\
                 J. E. Thornton, ``Parallel Operation in the Control Data 6600,'' Fall Joint Computers Conference, vol. 26, pp. 33--40, 1964 \\
                 R. M. Russell, ``The Cray-1 Computer System'', Comm. ACM, 21, 1 (January 1978), 63--72 \\
                 J. Kolodzey, ``Cray-1 Computer Technology'', IEEE Transactions on Components, Hybrids, and Manufacturing Technology, p181--187, June 1981 \\
                 G.
Moore, ``Cramming More Components onto Integrated Circuits'', Electronics, p114--117, April 1965 \\ S. Mazor, ``The History of the Microcomputer Invention and Evolution'', Proc. IEEE Dec '95, 1601--1607 \\ CHAPTER 2: Methods \\ G. M. Amdahl, ``Validity of the Single-Processor Approach to Achieving Large Scale Computing Capabilities'', AFIPS Conference Proceedings, (April 1967), 483--485 \\ M. D. Hill and A. J. Smith, ``Evaluating Associativity in CPU Caches'', IEEE Trans. on Computers, C-38, 12 (December 1989), 1612--1630 \\ J. S. Emer and D. W. Clark, ``A Characterization of Processor Performance in the VAX-11/780'', Proc. Eleventh International Symposium on Computer Architecture, Ann Arbor, MI (June 1984), 301--310 \\ CHAPTER 3: Instruction Sets \\ W. A. Wulf, ``Compilers and Computer Architecture'', IEEE Computer, 14, 7 (July 1981), 41--48 \\ G. Radin, ``The 801 Minicomputer,'' Proc. Symposium on Architectural Support for Programming Languages and Operating Systems, March 1982, 39--47 \\ D. A. Patterson and D. R. Ditzel, ``The Case for the Reduced Instruction Set Computer,'' ACM Computer Architecture News, 8, 6, 15 October 1980, 25--33 \\ R. P. Colwell, C. Y. Hitchcock, E. D. Jensen, H. M. Brinkley Sprunt, C. P. Kollar, ``Computers, Complexity, and Controversy,'' IEEE Computer, vol. 18, no. 9, September 1985 \\ J. Crawford, ``Architecture of the Intel 80386,'' Proceedings of ICCD , pp. 155--160, October 1986 \\ S. Mahlke, R. Hank, J. Mccormick, D. August, W. Hwu, ``A Comparison of Full and Partial Predicated Execution Support for ILP Processors'', Proc. 22nd Annual Symposium on Computer Architecture (June 1995), 138--150 \\ CHAPTER 4: Instruction Level Parallelism (ILP) \\ D. W. Anderson, F. J. Sparacio and R. M. Tomasulo, ``The IBM System/360 Model 91: Machine Philosophy and Instruction-Handling'', IBM Journal of Research and Development January 1967 \\ J. E. Smith and A. R. Pleszkun, ``Implementing Precise Interrupts in Pipelined Processors'', IEEE Trans. 
on Computers, C-37, 5 (May 1988), 562--573 \\ J. E. Smith, ``A Study of Branch Prediction Strategies'', Proc. Eighth Annual Symposium on Computer Architecture (May 1981), 135--148 \\ T.-Y. Yeh and Y. N. Patt, ``Two-Level Adaptive Branch Prediction,'' Proc. 24th Annual Workshop on Microprogramming (MICRO-24), Albuquerque, NM, (December 1991) \\ Y. N. Patt, W. W. Hwu and M. Shebanow, ``HPS, A New Microarchitecture: Introduction and Rationale,'' Proc. 18th Annual Workshop on Microprogramming, Pacific Grove, CA (December 1985), 103--108 \\ G. S. Sohi and S. Vajapeyam, ``Instruction Issue Logic for High-Performance, Interruptible Pipelined Processors'', Proc. 14th Annual Symposium on Computer Architecture (June 1987), 27--34 \\ G. F. Grohoski, ``Machine Organization of the IBM RISC System/6000 processor,'' IBM Journal of Research and Development, 34, 1 (January 1990), 37--58 \\ K. C. Yeager, ``The MIPS R10000 Superscalar Microprocessor'', IEEE Micro, 16, 2, April 1996, 28--40 \\ B. R. Rau and J. A. Fisher, ``Instruction-Level Parallel Processing: History, Overview, and Perspective'', The Journal of Supercomputing,, 7, 1, (??? 1993), 9--50. Reprinted in Rau and Fisher (ed.), ``Instruction-Level Parallelism, Kluwer Academic Publishers, 1993 \\ CHAPTER 5: Dataflow and Multithreading \\ J. B. Dennis and D. P. Misunas, ``A Preliminary Architecture for a Basic Data-Flow Processor,'' Proc. 2nd Annual Symposium on Computer Architecture, Computer Architecture News, 3, 4 (December 1974), 126--132, ACM \\ Arvind and R. S. Nikhil, ``Executing a Program on the MIT Tagged-Token Dataflow Architecture'', IEEE Trans. on Computers, 39, 3 (March 1990), 300--318 \\ B. Smith, ``Architecture and Applications of the HEP Multiprocessor Computer System'', Proc. of the Int. Soc. for Opt. Engr. (1981), 241--248 \\ D. M. Tullsen, S. J. Eggers, J. S. Emer, H. M. Levy, J. L. Lo and R. L. 
Stamm, ``Exploiting Choice: Instruction Fetch and Issue on an Implementable Simultaneous Multithreading Processor'', Proc. 23rd Annual Symposium on Computer Architecture (May 1996), 191--202 \\ CHAPTER 6: Memory Systems \\ M. V. Wilkes, ``Slave Memories and Dynamic Storage Allocation'', IEEE Trans. on Electronic Computers, EC-14, 2 (April 1965), 270--271 \\ J. S. Liptay, ``Structural Aspects of the System/360 Model 85, Part II: The Cache'', IBM Systems Journal,, 7, 1 (1968), 15--21 \\ D. Kroft, ``Lockup-Free Instruction Fetch/Prefetch Cache Organization'', Proc. Eighth Symposium on Computer Architecture (May 1981), 81--87 \\ J. R. Goodman, ``Using Cache Memory to Reduce Processor-Memory Traffic'', Proc. Tenth International Symposium on Computer Architecture, Stockholm, Sweden (June 1983), 124--131 \\ N. P. Jouppi, ``Improving Direct-Mapped Cache Performance by the Addition of a Small Fully-Associative Cache and Prefetch Buffers'', Proc. 17th Annual Symposium on Computer Architecture, Computer Architecture News, 18, 2 (June 1990), 364--373, ACM \\ T. Kilburn, D. B. G. Edwards, M. J. Lanigan, F. H. Sumner, ``One-Level Storage System'', IRE Transactions, EC-11, 2, (April 1962), 223--235 \\ D. W. Clark and J. S. Emer, ``Performance of the VAX-11/780 Translation Buffer: Simulation and Measurement'', ACM Trans. on Computer Systems, 3, 1 (February 1985), 31--62 \\ W. Wang, J.-L. Baer and H. M. Levy, ``Organization and Performance of a Two-Level Virtual-Real Cache Hierarchy'', Proc. 16th Annual International Symposium on Computer Architecture, Jerusalem (June 1989), 140--148 \\ CHAPTER 7: I/O: Storage Systems, Networks, and Graphics \\ M. Smotherman, ``A Sequencing-based Taxonomy of I/O Systems and Review of Historical Machines'', ACM Computer Architecture News 17:5, (September 1989), pgs 5--15. Storage Systems \\ C. Ruemmler and J. Wilkes, ``An Introduction to Disk Drive Modeling'', IEEE Computer vol 27 #3, March 1994, pgs 17--28 \\ D. A. Patterson, G. Gibson and R. H. 
Katz, ``A Case for Redundant Arrays of Inexpensive Disks (RAID)'', Proc. ACM SIGMOD Conference, Chicago, Illinois (June 1988). Networks \\ R. Metcalfe and D. Boggs, ``Ethernet: Distributed Packet Switching for Local Computer Networks.'' Communications of the ACM, 19(7):395--404 \\ L. Ni and P. McKinley, ``A Survey of Wormhole Routing Techniques in Direct Networks'', IEEE Computer, February 1993, vol 26 #2, pgs 62--76. Graphics \\ K. Akeley, ``Reality Engine Graphics'', SIGGRAPH '93 Proceedings, pp 109--116 \\ CHAPTER 8: Single-Instruction Multiple Data (SIMD) Parallelism \\ M. J. Flynn, ``Very High-Speed Computing Systems'', Proceedings of the IEEE, vol. 54, no. 12, December 1966 \\ D. J. Kuck and R. A. Stokes, ``The Burroughs Scientific Processor (BSP)'', IEEE Trans. on Computers, vol. C-31, pp. 363--376, May 1982 \\ M. Gokhale, B. Holmes, K. Iobst, ``Processing in Memory: The Terasys Massively Parallel PIM Array'', IEEE Computer, 28, 4 (April 1995), 23--31 \\ CHAPTER 9: Multiprocessors and Multicomputers \\ W. A. Wulf and S. P. Harbison, ``Reflections in a pool of processors / An experience report on C.mmp/Hydra'', Proc. National Computer Conference (AFIPS) (June 1978) \\ L. Lamport, ``How to Make a Multiprocessor Computer That Correctly Executes Multiprocess Programs'', IEEE Trans. on Computers, C-28, 9 (September 1979), 690--691 \\ L. M. Censier and P. Feautrier, ``A New Solution to Coherence Problems in Multicache Systems'', IEEE Transactions on Computers, C-27, 12 (December 1978), 1112--1118 \\ D. Lenoski, J. Laudon, K. Gharachorloo, W. Weber, A. Gupta, J. Hennessy, M. Horowitz and M. Lam, ``The Stanford DASH Multiprocessor'', IEEE Computer, 25, 3 (March 1992), 63--79 \\ E. Hagersten, A. Landin, and S. Haridi, ``DDM--A Cache-Only Memory Architecture'', IEEE Computer, 25, 9 (September 1992), 44--54 \\ C. L. Seitz, ``The Cosmic Cube'', Comm. ACM (January 1985), 22--33 \\ K. Li and P. Hudak, ``Memory Coherence in Shared Virtual Memory Systems'', ACM Trans.
on Computer Systems, 7, 4 (November 1989), 321--359 \\ CHAPTER 10: Recent Implementations and Future Prospects \\ D. Alpert, D. Avnon, ``Architecture of the Pentium Microprocessor'', IEEE Micro, June '93, 11--21 \\ D. Papworth, ``Tuning the Pentium Pro Micro Architecture'', IEEE Micro April '96, 8--15 \\ M. Slater, ``The Microprocessor Today'', IEEE Micro Dec '96, 32--44 \\ A. Yu, ``The Future of Microprocessors'', IEEE Micro Dec '96, 46--53.", }

%%% ISCA 28 (Goteborg, June/July 2001) proceedings; volume 29(2).
%%% The series field uses the shared journal macro, like the other ISCA
%%% entries, instead of a separately spelled literal string.
@Proceedings{ACM:2001:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 28th International Symposium on Computer Architecture, June 30--July 4, 2001, G{\"o}teborg, Sweden}",
  title =        "{Proceedings of the 28th International Symposium on Computer Architecture, June 30--July 4, 2001, G{\"o}teborg, Sweden}",
  volume =       "29(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xi + 289",
  year =         "2001",
  ISBN =         "0-7695-1162-7, 0-7695-1163-5, 0-7695-1164-3",
  ISBN-13 =      "978-0-7695-1162-7, 978-0-7695-1163-4, 978-0-7695-1164-1",
  LCCN =         "QA76.9.A73 C64 2001",
  bibdate =      "Fri May 12 12:36:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; z3950.bibsys.no:2100/BIBSYS",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=379240",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '01 proceedings.",
}

%%% ISCA 29 (Anchorage, May 2002) proceedings; volume 30(2).
%%% The series field uses the shared journal macro here as well.
@Proceedings{ACM:2002:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 29th International Symposium on Computer Architecture, May 25--29, 2002, Anchorage, Alaska}",
  title =        "{Proceedings of the 29th International Symposium on Computer Architecture, May 25--29, 2002, Anchorage, Alaska}",
  volume =       "30(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xv + 331",
  year =         "2002",
  ISBN =         "0-7695-1605-X, 0-7695-1606-8, 0-7695-1607-6",
  ISBN-13 =      "978-0-7695-1605-9, 978-0-7695-1606-6, 978-0-7695-1607-3",
  LCCN =         "QA76.9.A73 S97 2002",
  bibdate =      "Fri May 12 12:36:48 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib; z3950.bibsys.no:2100/BIBSYS",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=545215",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '02 proceedings.",
}

%%% ISCA 30 (San Diego, June 2003) proceedings; volume 31(2).
@Proceedings{IEEE:2003:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: 30th Annual International Symposium on Computer Architecture: San Diego, California, USA, June 9--11, 2003: ISCA '03}",
  title =        "{Proceedings: 30th Annual International Symposium on Computer Architecture: San Diego, California, USA, June 9--11, 2003: ISCA '03}",
  volume =       "31(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 448",
  year =         "2003",
  CODEN =        "CANED2",
  ISBN =         "0-7695-1945-8",
  ISBN-13 =      "978-0-7695-1945-6",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76 .S93 2002",
  bibdate =      "Fri May 12 12:35:09 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=859618",
  acknowledgement = ack-nhfb,
}

%%% ISCA 31 (Munich, June 2004) proceedings; volume 32(2).
%%% The remark names the 2004 meeting, matching this entry's title and year.
@Proceedings{ACM:2004:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: 31st Annual International Symposium on Computer Architecture: ISCA 2004: [June 19--23, 2004, M{\"u}nchen, Germany]}",
  title =        "{Proceedings: 31st Annual International Symposium on Computer Architecture: ISCA 2004: [June 19--23, 2004, M{\"u}nchen, Germany]}",
  volume =       "32(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiv + 388",
  year =         "2004",
  CODEN =        "CANED2",
  ISBN =         "0-7695-2143-6",
  ISBN-13 =      "978-0-7695-2143-5",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.5 .S84 2004",
  bibdate =      "Fri May 12 12:32:28 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "Includes CD-ROM.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=998680",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '04 Proceedings",
}

%%% ISCA 32 (Madison, June 2005) proceedings.
@Proceedings{IEEE:2005:ISC,
  editor =       "{IEEE}",
  booktitle =    "{32nd International Symposium on Computer Architecture: proceedings, Madison, Wisconsin, June 4--8, 2005}",
  title =        "{32nd
International Symposium on Computer Architecture: proceedings, Madison, Wisconsin, June 4--8, 2005}",
  volume =       "33(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 557",
  year =         "2005",
  CODEN =        "CANED2",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "????",
  bibdate =      "Fri May 12 13:31:22 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "Includes CD-ROM.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=1069807",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '05 Proceedings",
}

%%% ISCA 33 (Boston, June 2006) proceedings.  Volume, pages, ISBN and LCCN
%%% are still recorded with question-mark placeholders.
@Proceedings{IEEE:2006:ISC,
  editor =       "{IEEE}",
  booktitle =    "{33rd International Symposium on Computer Architecture: proceedings, Boston, MA, USA, June 17--21, 2006}",
  title =        "{33rd International Symposium on Computer Architecture: proceedings, Boston, MA, USA, June 17--21, 2006}",
  volume =       "??(??)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "2006",
  CODEN =        "CANED2",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "????",
  bibdate =      "Fri May 12 13:31:22 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://www.ece.neu.edu/conf/isca2006/",
  acknowledgement = ack-nhfb,
  remark =       "ISCA 33 Proceedings",
}